ablog

不器用で落着きのない技術者のメモ

Parquet

検証結果

  • Athena
# クエリ 実行時間 I/O量
1 select count(*) from amazon_reviews_parquet 5.6秒 0KB
2 select count(year) from amazon_reviews_parquet 6.63秒 2.58MB
3 select count(review_body) from amazon_reviews_parquet 5.7秒 34.05GB
4 select * from amazon_reviews_parquet limit 10000 2.05秒 455.11MB
5 select year from amazon_reviews_parquet limit 10000 0.92秒 163.79KB
6 select review_body from amazon_reviews_parquet limit 10000 0.54秒 3.25MB


準備手順

$ aws s3 mb s3://amazon-reviews-pds-az
$ aws s3 cp --recursive s3://amazon-reviews-pds/ s3://amazon-reviews-pds-az
CREATE EXTERNAL TABLE amazon_reviews_parquet(
  marketplace string, 
  customer_id string, 
  review_id string, 
  product_id string, 
  product_parent string, 
  product_title string, 
  star_rating int, 
  helpful_votes int, 
  total_votes int, 
  vine string, 
  verified_purchase string, 
  review_headline string, 
  review_body string, 
  review_date bigint, 
  year int)
PARTITIONED BY (product_category string)
ROW FORMAT SERDE 
  'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' 
STORED AS INPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' 
OUTPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
  's3://amazon-reviews-pds-az/parquet/'
MSCK REPAIR TABLE  amazon_reviews_parquet

情報採取

  • select count(*) from amazon_reviews_parquet
select eventname, count(eventname) as apicall_count from default.cloudtrail_logs_cloudtrail_123456789012_do_not_delete 
where eventsource = 's3.amazonaws.com' 
and useragent = 'athena.amazonaws.com' 
and awsregion = 'us-east-1'
and eventtime > '2018-09-23T07:12:00Z'
and eventtime < '2018-09-23T07:35:00Z'
group by eventname
order by apicall_count desc
eventname apicall_count
GetObject 2230
HeadObject 2203
HeadBucket 176
ListObjects 43
PutObject 2
  • select count(year) from amazon_reviews_parquet
select eventname, count(eventname) as apicall_count from default.cloudtrail_logs_cloudtrail_123456789012_do_not_delete 
where eventsource = 's3.amazonaws.com' 
and useragent = 'athena.amazonaws.com' 
and awsregion = 'us-east-1'
and eventtime > '2018-09-23T07:35:00Z'
and eventtime < '2018-09-23T07:55:00Z'
group by eventname
order by apicall_count desc
eventname apicall_count
GetObject 2894
HeadObject 2238
HeadBucket 229
ListObjects 43
PutObject 2
  • select count(review_body) from amazon_reviews_parquet
select eventname, count(eventname) as apicall_count from default.cloudtrail_logs_cloudtrail_123456789012_do_not_delete 
where eventsource = 's3.amazonaws.com' 
and useragent = 'athena.amazonaws.com' 
and awsregion = 'us-east-1'
and eventtime > '2018-09-23T07:50:00Z'
and eventtime < '2018-09-23T08:00:00Z'
group by eventname
order by apicall_count desc
eventname apicall_count
GetObject 3027
HeadObject 2248
HeadBucket 668
ListObjects 43
PutObject 2
  • select * from amazon_reviews_parquet limit 10000
select eventname, count(eventname) as apicall_count from default.cloudtrail_logs_cloudtrail_123456789012_do_not_delete 
where eventsource = 's3.amazonaws.com' 
and useragent = 'athena.amazonaws.com' 
and awsregion = 'us-east-1'
and eventtime > '2018-09-23T08:10:00Z'
and eventtime < '2018-09-23T08:20:00Z'
group by eventname
order by apicall_count desc
eventname apicall_count
GetObject 725
HeadObject 607
HeadBucket 105
ListObjects 43
PutObject 2
  • select year from amazon_reviews_parquet limit 10000
select eventname, count(eventname) as apicall_count from default.cloudtrail_logs_cloudtrail_123456789012_do_not_delete 
where eventsource = 's3.amazonaws.com' 
and useragent = 'athena.amazonaws.com' 
and awsregion = 'us-east-1'
and eventtime > '2018-09-23T08:20:00Z'
and eventtime < '2018-09-23T08:30:00Z'
group by eventname
order by apicall_count desc
eventname apicall_count
HeadObject 2095
GetObject 1934
HeadBucket 327
ListObjects 43
PutObject 2
  • select review_body from amazon_reviews_parquet limit 10000
select eventname, count(eventname) as apicall_count from default.cloudtrail_logs_cloudtrail_123456789012_do_not_delete 
where eventsource = 's3.amazonaws.com' 
and useragent = 'athena.amazonaws.com' 
and awsregion = 'us-east-1'
and eventtime > '2018-09-23T08:40:00Z'
and eventtime < '2018-09-23T08:59:00Z'
group by eventname
order by apicall_count desc
eventname apicall_count
GetObject 1153
HeadObject 1117
HeadBucket 500
ListObjects 43
PutObject 2

検証パターン