分区表
create external table track_info(
ip string,
country string,
province string,
city string,
url string,
time string,
page string
) partitioned by (day string)ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
location '/project/trackinfo/';
crontab表达式进行调度
Azkaban调度:ETLApp==》其他的统计分析
之后将清洗过后的数据加载进去分区表
LOAD DATA INPATH 'hdfs://hadoop000:8020/project/input/etl' OVERWRITE INTO TABLE track_info partition(day='2013-07-21') ;
select count(*) from track_info where day='2013-07-21';
统计省份
select province,count(*) from track_info where day='2013-07-21' group by province;
省份统计表
create external table track_info_province_stat(
province string,
cnt bigint
) partitioned by (day string)ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
insert overwrite table track_info_province_stat partition(day='2013-07-21')
select province,count(*) as cnt from track_info where day='2013-07-21' group by province;
统计的数据已经在Hive表track_info_province_stat而且这个表是一个分区表,后续统计报表的数据可以直接从这个表中查询,也可以将hive表中的数据导出到关系型数据库中去(sqoop)
第一步:etl
第二步:把elt中输出中的数据加载到track_info分区表中去
第三步:各个维度统计结果的数据输出到各自维度的表里(track_info_province_stat)
第四步:将数据导出(optional)