ablog

不器用で落着きのない技術者のメモ

IIS のアクセスログを Amazon Athena でクエリする

#Fields: date time s-ip cs-method cs-uri-stem cs-uri-query s-port cs-username c-ip cs(User-Agent) sc-status sc-substatus sc-win32-status time-taken
  • テーブルを作成する。
CREATE EXTERNAL TABLE `iis_w3c_logs`( 
  date_col string, 
  time_col string, 
  s_ip string,
  cs_method string, 
  cs_uri_stem string, 
  cs_uri_query string,
  s_port string,
  cs_username string,
  c_ip string,
  user_agent string,
  sc_status string,
  sc_substatus string,
  sc_win32_status string,
  time_taken string
  ) 
ROW FORMAT DELIMITED  
  FIELDS TERMINATED BY ' '  
STORED AS INPUTFORMAT  
  'org.apache.hadoop.mapred.TextInputFormat'  
OUTPUTFORMAT  
  'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' 
LOCATION   's3://iis-log/w3c-log/'
  • クエリを実行してみる。
select cs_uri_stem, count(1) as cnt 
FROM "default"."iis_w3c_logs" 
group by cs_uri_stem 
order by cnt desc;
  • Parquet に CTAS する
CREATE TABLE iis_w3c_logs_parquet 
WITH (
     format = 'PARQUET', 
     external_location = 's3://iis-log/accesslog_parquet/'
     ) 
AS select *
FROM iis_w3c_logs;