検証結果
# |
クエリ |
実行時間 |
I/O量 |
1 |
select count(*) from amazon_reviews_parquet |
5.6秒 |
0KB |
2 |
select count(year) from amazon_reviews_parquet |
6.63秒 |
2.58MB |
3 |
select count(review_body) from amazon_reviews_parquet |
5.7秒 |
34.05GB |
4 |
select * from amazon_reviews_parquet limit 10000 |
2.05秒 |
455.11MB |
5 |
select year from amazon_reviews_parquet limit 10000 |
0.92秒 |
163.79KB |
6 |
select review_body from amazon_reviews_parquet limit 10000 |
0.54秒 |
3.25MB |
準備手順
$ aws s3 mb s3://amazon-reviews-pds-az
$ aws s3 cp --recursive s3://amazon-reviews-pds/ s3://amazon-reviews-pds-az
CREATE EXTERNAL TABLE amazon_reviews_parquet(
marketplace string,
customer_id string,
review_id string,
product_id string,
product_parent string,
product_title string,
star_rating int,
helpful_votes int,
total_votes int,
vine string,
verified_purchase string,
review_headline string,
review_body string,
review_date bigint,
year int)
PARTITIONED BY (product_category string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
's3://amazon-reviews-pds-az/parquet/'
MSCK REPAIR TABLE amazon_reviews_parquet
情報採取
- select count(*) from amazon_reviews_parquet
select eventname, count(eventname) as apicall_count from default.cloudtrail_logs_cloudtrail_123456789012_do_not_delete
where eventsource = 's3.amazonaws.com'
and useragent = 'athena.amazonaws.com'
and awsregion = 'us-east-1'
and eventtime > '2018-09-23T07:12:00Z'
and eventtime < '2018-09-23T07:35:00Z'
group by eventname
order by apicall_count desc
eventname |
apicall_count |
GetObject |
2230 |
HeadObject |
2203 |
HeadBucket |
176 |
ListObjects |
43 |
PutObject |
2 |
- select count(year) from amazon_reviews_parquet
select eventname, count(eventname) as apicall_count from default.cloudtrail_logs_cloudtrail_123456789012_do_not_delete
where eventsource = 's3.amazonaws.com'
and useragent = 'athena.amazonaws.com'
and awsregion = 'us-east-1'
and eventtime > '2018-09-23T07:35:00Z'
and eventtime < '2018-09-23T07:55:00Z'
group by eventname
order by apicall_count desc
eventname |
apicall_count |
GetObject |
2894 |
HeadObject |
2238 |
HeadBucket |
229 |
ListObjects |
43 |
PutObject |
2 |
- select count(review_body) from amazon_reviews_parquet
select eventname, count(eventname) as apicall_count from default.cloudtrail_logs_cloudtrail_123456789012_do_not_delete
where eventsource = 's3.amazonaws.com'
and useragent = 'athena.amazonaws.com'
and awsregion = 'us-east-1'
and eventtime > '2018-09-23T07:50:00Z'
and eventtime < '2018-09-23T08:00:00Z'
group by eventname
order by apicall_count desc
eventname |
apicall_count |
GetObject |
3027 |
HeadObject |
2248 |
HeadBucket |
668 |
ListObjects |
43 |
PutObject |
2 |
- select * from amazon_reviews_parquet limit 10000
select eventname, count(eventname) as apicall_count from default.cloudtrail_logs_cloudtrail_123456789012_do_not_delete
where eventsource = 's3.amazonaws.com'
and useragent = 'athena.amazonaws.com'
and awsregion = 'us-east-1'
and eventtime > '2018-09-23T08:10:00Z'
and eventtime < '2018-09-23T08:20:00Z'
group by eventname
order by apicall_count desc
eventname |
apicall_count |
GetObject |
725 |
HeadObject |
607 |
HeadBucket |
105 |
ListObjects |
43 |
PutObject |
2 |
- select year from amazon_reviews_parquet limit 10000
select eventname, count(eventname) as apicall_count from default.cloudtrail_logs_cloudtrail_123456789012_do_not_delete
where eventsource = 's3.amazonaws.com'
and useragent = 'athena.amazonaws.com'
and awsregion = 'us-east-1'
and eventtime > '2018-09-23T08:20:00Z'
and eventtime < '2018-09-23T08:30:00Z'
group by eventname
order by apicall_count desc
eventname |
apicall_count |
HeadObject |
2095 |
GetObject |
1934 |
HeadBucket |
327 |
ListObjects |
43 |
PutObject |
2 |
- select review_body from amazon_reviews_parquet limit 10000
select eventname, count(eventname) as apicall_count from default.cloudtrail_logs_cloudtrail_123456789012_do_not_delete
where eventsource = 's3.amazonaws.com'
and useragent = 'athena.amazonaws.com'
and awsregion = 'us-east-1'
and eventtime > '2018-09-23T08:40:00Z'
and eventtime < '2018-09-23T08:59:00Z'
group by eventname
order by apicall_count desc
eventname |
apicall_count |
GetObject |
1153 |
HeadObject |
1117 |
HeadBucket |
500 |
ListObjects |
43 |
PutObject |
2 |