$ s3-dist-cp --src s3://amazon-reviews-pds/parquet/ --dest /amazon-reviews-pds/parquet/
$ hadoop fs -ls /amazon-reviews-pds/parquet/|head -10
Found 43 items
drwxr-xr-x - hadoop hadoop 0 2018-09-29 20:12 /amazon-reviews-pds/parquet/product_category=Apparel
drwxr-xr-x - hadoop hadoop 0 2018-09-29 20:13 /amazon-reviews-pds/parquet/product_category=Automotive
drwxr-xr-x - hadoop hadoop 0 2018-09-29 20:12 /amazon-reviews-pds/parquet/product_category=Baby
drwxr-xr-x - hadoop hadoop 0 2018-09-29 20:12 /amazon-reviews-pds/parquet/product_category=Beauty
drwxr-xr-x - hadoop hadoop 0 2018-09-29 20:14 /amazon-reviews-pds/parquet/product_category=Books
drwxr-xr-x - hadoop hadoop 0 2018-09-29 20:11 /amazon-reviews-pds/parquet/product_category=Camera
drwxr-xr-x - hadoop hadoop 0 2018-09-29 20:13 /amazon-reviews-pds/parquet/product_category=Digital_Ebook_Purchase
drwxr-xr-x - hadoop hadoop 0 2018-09-29 20:15 /amazon-reviews-pds/parquet/product_category=Digital_Music_Purchase
drwxr-xr-x - hadoop hadoop 0 2018-09-29 20:15 /amazon-reviews-pds/parquet/product_category=Digital_Software
$ hive
hive> CREATE EXTERNAL TABLE parquet.amazon_reviews_parquet(
marketplace string,
customer_id string,
review_id string,
product_id string,
product_parent string,
product_title string,
star_rating int,
helpful_votes int,
total_votes int,
vine string,
verified_purchase string,
review_headline string,
review_body string,
review_date bigint,
year int)
PARTITIONED BY (product_category string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
'hdfs:///amazon-reviews-pds/parquet';
hive> MSCK REPAIR TABLE parquet.amazon_reviews_parquet;
hive> quit
Presto からクエリを投げてみる
$ presto-cli
presto> use hive.parquet;
presto:parquet> select count(*) from amazon_reviews_parquet;
_col0
160796570
(1 row)
Query 20181008_103006_00003_ypxzu, FINISHED, 2 nodes
Splits: 1,126 total, 1,126 done (100.00%)
0:03 [161M rows, 0B] [49.9M rows/s, 0B/s]
presto:parquet> select count(year) from amazon_reviews_parquet;
_col0
160796570
(1 row)
Query 20181008_103037_00004_ypxzu, FINISHED, 2 nodes
Splits: 1,132 total, 1,132 done (100.00%)
0:05 [161M rows, 2.58MB] [31.2M rows/s, 513KB/s]
presto:parquet> select count(review_body) from amazon_reviews_parquet;
_col0
160789772
(1 row)
Query 20181008_103111_00005_ypxzu, FINISHED, 2 nodes
Splits: 1,137 total, 1,137 done (100.00%)
3:58 [161M rows, 34GB] [675K rows/s, 146MB/s]
インストール
- sysdig をインストールする。
- iotop をインストールする。
$ sudo yum -y install iotop
$ sudo yum -y install htop
データ
$ hadoop fs -ls -h -R /amazon-reviews-pds/parquet/|head -50
drwxr-xr-x - hadoop hadoop 0 2018-09-29 20:12 /amazon-reviews-pds/parquet/product_category=Apparel
-rw-r--r-- 1 hadoop hadoop 115.0 M 2018-09-29 20:11 /amazon-reviews-pds/parquet/product_category=Apparel/part-00000-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 115.3 M 2018-09-29 20:12 /amazon-reviews-pds/parquet/product_category=Apparel/part-00001-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 114.9 M 2018-09-29 20:12 /amazon-reviews-pds/parquet/product_category=Apparel/part-00002-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 115.2 M 2018-09-29 20:12 /amazon-reviews-pds/parquet/product_category=Apparel/part-00003-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 115.3 M 2018-09-29 20:12 /amazon-reviews-pds/parquet/product_category=Apparel/part-00004-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 115.3 M 2018-09-29 20:12 /amazon-reviews-pds/parquet/product_category=Apparel/part-00005-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 115.4 M 2018-09-29 20:12 /amazon-reviews-pds/parquet/product_category=Apparel/part-00006-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 114.8 M 2018-09-29 20:12 /amazon-reviews-pds/parquet/product_category=Apparel/part-00007-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 115.3 M 2018-09-29 20:12 /amazon-reviews-pds/parquet/product_category=Apparel/part-00008-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 115.3 M 2018-09-29 20:12 /amazon-reviews-pds/parquet/product_category=Apparel/part-00009-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
drwxr-xr-x - hadoop hadoop 0 2018-09-29 20:13 /amazon-reviews-pds/parquet/product_category=Automotive
-rw-r--r-- 1 hadoop hadoop 80.8 M 2018-09-29 20:12 /amazon-reviews-pds/parquet/product_category=Automotive/part-00000-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 81.2 M 2018-09-29 20:12 /amazon-reviews-pds/parquet/product_category=Automotive/part-00001-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 80.9 M 2018-09-29 20:12 /amazon-reviews-pds/parquet/product_category=Automotive/part-00002-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 81.1 M 2018-09-29 20:13 /amazon-reviews-pds/parquet/product_category=Automotive/part-00003-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 81.1 M 2018-09-29 20:13 /amazon-reviews-pds/parquet/product_category=Automotive/part-00004-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 80.8 M 2018-09-29 20:13 /amazon-reviews-pds/parquet/product_category=Automotive/part-00005-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 81.1 M 2018-09-29 20:13 /amazon-reviews-pds/parquet/product_category=Automotive/part-00006-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 80.6 M 2018-09-29 20:13 /amazon-reviews-pds/parquet/product_category=Automotive/part-00007-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 80.9 M 2018-09-29 20:13 /amazon-reviews-pds/parquet/product_category=Automotive/part-00008-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 81.3 M 2018-09-29 20:13 /amazon-reviews-pds/parquet/product_category=Automotive/part-00009-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
drwxr-xr-x - hadoop hadoop 0 2018-09-29 20:12 /amazon-reviews-pds/parquet/product_category=Baby
-rw-r--r-- 1 hadoop hadoop 48.9 M 2018-09-29 20:11 /amazon-reviews-pds/parquet/product_category=Baby/part-00000-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 49.5 M 2018-09-29 20:11 /amazon-reviews-pds/parquet/product_category=Baby/part-00001-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 49.3 M 2018-09-29 20:11 /amazon-reviews-pds/parquet/product_category=Baby/part-00002-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 49.0 M 2018-09-29 20:11 /amazon-reviews-pds/parquet/product_category=Baby/part-00003-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 49.1 M 2018-09-29 20:12 /amazon-reviews-pds/parquet/product_category=Baby/part-00004-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 49.1 M 2018-09-29 20:12 /amazon-reviews-pds/parquet/product_category=Baby/part-00005-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 49.3 M 2018-09-29 20:12 /amazon-reviews-pds/parquet/product_category=Baby/part-00006-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 49.0 M 2018-09-29 20:12 /amazon-reviews-pds/parquet/product_category=Baby/part-00007-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 49.0 M 2018-09-29 20:12 /amazon-reviews-pds/parquet/product_category=Baby/part-00008-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 48.9 M 2018-09-29 20:12 /amazon-reviews-pds/parquet/product_category=Baby/part-00009-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
drwxr-xr-x - hadoop hadoop 0 2018-09-29 20:12 /amazon-reviews-pds/parquet/product_category=Beauty
-rw-r--r-- 1 hadoop hadoop 127.1 M 2018-09-29 20:12 /amazon-reviews-pds/parquet/product_category=Beauty/part-00000-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 127.3 M 2018-09-29 20:12 /amazon-reviews-pds/parquet/product_category=Beauty/part-00001-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 127.2 M 2018-09-29 20:12 /amazon-reviews-pds/parquet/product_category=Beauty/part-00002-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 126.9 M 2018-09-29 20:12 /amazon-reviews-pds/parquet/product_category=Beauty/part-00003-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 127.0 M 2018-09-29 20:12 /amazon-reviews-pds/parquet/product_category=Beauty/part-00004-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 127.0 M 2018-09-29 20:13 /amazon-reviews-pds/parquet/product_category=Beauty/part-00005-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 126.8 M 2018-09-29 20:13 /amazon-reviews-pds/parquet/product_category=Beauty/part-00006-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 127.0 M 2018-09-29 20:13 /amazon-reviews-pds/parquet/product_category=Beauty/part-00007-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 127.4 M 2018-09-29 20:13 /amazon-reviews-pds/parquet/product_category=Beauty/part-00008-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 127.5 M 2018-09-29 20:13 /amazon-reviews-pds/parquet/product_category=Beauty/part-00009-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
drwxr-xr-x - hadoop hadoop 0 2018-09-29 20:14 /amazon-reviews-pds/parquet/product_category=Books
-rw-r--r-- 1 hadoop hadoop 1.0 G 2018-09-29 20:14 /amazon-reviews-pds/parquet/product_category=Books/part-00000-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 1.0 G 2018-09-29 20:14 /amazon-reviews-pds/parquet/product_category=Books/part-00001-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 1.0 G 2018-09-29 20:15 /amazon-reviews-pds/parquet/product_category=Books/part-00002-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 1.0 G 2018-09-29 20:15 /amazon-reviews-pds/parquet/product_category=Books/part-00003-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet
-rw-r--r-- 1 hadoop hadoop 1.0 G 2018-09-29 20:15 /amazon-reviews-pds/parquet/product_category=Books/part-00004-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet