@tsing1226
2016-01-02T14:45:32.000000Z
hive
Requirement analysis: compute PV (page views) and UV (unique visitors) for each hour of each day.
Analysis steps:
1. Create the Hive table; the table's column delimiter must match the source file.
2. Load the data into the Hive table.
3. Write the Hive SQL and store the result in a second Hive table.
4. Export the result data from the second Hive table to the target MySQL table with Sqoop; downstream reports then read from MySQL.
drop database if exists db_track ;
create database db_track ;
drop table if exists db_track.track_log ;
create table db_track.track_log (
id string ,
url string ,
referer string ,
keyword string ,
type string ,
guid string ,
pageId string ,
moduleId string ,
linkId string ,
attachedInfo string ,
sessionId string ,
trackerU string ,
trackerType string ,
ip string ,
trackerSrc string ,
cookie string ,
orderCode string ,
trackTime string ,
endUserId string ,
firstLink string ,
sessionViewNo string ,
productId string ,
curMerchantId string ,
provinceId string ,
cityId string ,
fee string ,
edmActivity string ,
edmEmail string ,
edmJobId string ,
ieVersion string ,
platform string ,
internalKeyword string ,
resultSum string ,
currentPage string ,
linkPosition string ,
buttonPosition string
)
PARTITIONED BY (date string,hour string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' ;
load data local inpath '/opt/datas/2015082818' overwrite into table db_track.track_log partition(date = '20150828', hour = '18') ;
load data local inpath '/opt/datas/2015082819' overwrite into table db_track.track_log partition(date = '20150828', hour = '19') ;
View the table's partitions:
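Partitions can be listed with:
show partitions db_track.track_log ;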
Count how many rows were loaded:
select count(1) from db_track.track_log where date='20150828' ;
Compute PV and UV for each day and hour:
select
date,hour,count(1) pv,count(distinct guid) uv
from db_track.track_log
where date ='20150828'
group by date ,hour ;
Generally, run the query in Hive first, then load the results into a result table or perform further operations.
drop table if exists db_track.daily_hour_visit ;
create table db_track.daily_hour_visit (
date string ,
hour string ,
pv string ,
uv string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' ;
insert into table db_track.daily_hour_visit
select
date, hour, count(url) pv, count(distinct guid) uv
from db_track.track_log
where date = '20150828'
group by date, hour ;
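A quick way to sanity-check the result table before exporting it:
select * from db_track.daily_hour_visit ;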
Create the MySQL database:
drop database if exists db_track ;
create database db_track ;
use db_track ;
Create the MySQL result table:
create table daily_hour_visit(
date varchar(255) not null,
hour varchar(255) not null,
pv varchar(255) ,
uv varchar(255) ,
primary key (date, hour)
);
bin/sqoop export \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_track \
--username root \
--password 123456 \
--table daily_hour_visit \
--num-mappers 1 \
--input-fields-terminated-by "\t" \
--export-dir /user/hive/warehouse/db_track.db/daily_hour_visit/
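Once the export finishes, the rows can be checked on the MySQL side (same host and credentials as in the connect string above):
mysql -h hadoop-senior01.grc.com -uroot -p123456 -e "select * from db_track.daily_hour_visit ;"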
1. Statistics must be computed every day; each run covers the previous day's data, and the analysis results are exported to the MySQL result table.
2. How do we load the log data into the corresponding Hive table?
3. Workflow
* Analyze the data
* Export the results to the RDBMS table
Determine which log data to load from the directory name;
Determine the target table partition from the log file name;
Write a shell script to implement the data loading.
Every day, load the previous day's log files into table db_track.track_log.
Note: the log files are collected on a single log server.
Each day's log files are placed in one directory on that machine
- the directory name is the date [YYYYMMDD], e.g. [20151212]
A log file is produced once per hour
- the file name is the date plus the hour [YYYYMMDDHH], e.g. [2015122000]
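For illustration, with the naming scheme above a day's worth of logs would look like this (the dates are only an example):
/opt/datas/track_logs/20151212/2015121200
/opt/datas/track_logs/20151212/2015121201
...
/opt/datas/track_logs/20151212/2015121223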
Script load_track-log.sh:
#!/bin/sh
## load system environment variables
. /etc/profile
## set HIVE_HOME
HIVE_HOME=/opt/cdh3.5.6/hive-0.13.1-cdh5.3.6
## track logs directory
LOG_DIR=/opt/datas/track_logs
## yesterday's date, e.g. 20151212
yesterday=`date -d '-1 days' '+%Y%m%d'`
## loop over every hourly log file in yesterday's directory
for line in `ls $LOG_DIR/$yesterday`
do
    ## the file name is [YYYYMMDDHH]: the first 8 characters are the date, the last 2 the hour
    date=${line:0:8}
    hour=${line:8:2}
    ## load the file into the matching partition of the Hive table
    $HIVE_HOME/bin/hive -e "load data local inpath '$LOG_DIR/$yesterday/$line' overwrite into table db_track.track_log partition(date='$date',hour='$hour')"
done
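Before scheduling it, the load script can be run once by hand to confirm the partitions get created (the path matches the crontab entries at the end):
chmod u+x /opt/datas/track_logs/load_track-log.sh
sh /opt/datas/track_logs/load_track-log.sh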
Script daily_hour_visit.sh:
#!/bin/sh
## set HIVE_HOME
HIVE_HOME=/opt/cdh3.5.6/hive-0.13.1-cdh5.3.6
## set SQOOP_HOME
SQOOP_HOME=/opt/cdh3.5.6/sqoop-1.4.5-cdh5.3.6
## directory holding the scripts, the SQL file and the options file
SCRIPT_PATH=/opt/datas/track_logs
## the partition to analyze is yesterday's date, e.g. 20151212
date=`date -d '-1 days' '+%Y%m%d'`
echo "########hive loads data into the result table##########"
## run the analysis SQL, passing the partition date in as a hiveconf variable
$HIVE_HOME/bin/hive --hiveconf date=$date -f $SCRIPT_PATH/hive_visit.sql
## export the result table to mysql with sqoop
echo "########export data to mysql using sqoop##########"
$SQOOP_HOME/bin/sqoop --options-file $SCRIPT_PATH/export_daily_visit.txt
SQL file hive_visit.sql:
-- drop table db_track.daily_hour_visit
drop table if exists db_track.daily_hour_visit ;
-- create table db_track.daily_hour_visit
create table db_track.daily_hour_visit (
date string ,
hour string ,
pv string ,
uv string
)
row format delimited fields terminated by '\t' ;
-- compute PV and UV for the partition passed in via --hiveconf date=...
insert into table db_track.daily_hour_visit
select
date, hour, count(url) pv, count(distinct guid) uv
from db_track.track_log
where date = '${hiveconf:date}'
group by date, hour ;
Options file export_daily_visit.txt (one option or value per line):
export
--connect
jdbc:mysql://hadoop-senior01.grc.com:3306/db_track
--username
root
--password
123456
--table
daily_hour_visit
--num-mappers
1
--input-fields-terminated-by
"\t"
--export-dir
/user/hive/warehouse/db_track.db/daily_hour_visit
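With hive_visit.sql and export_daily_visit.txt in place alongside the scripts, the whole daily job can also be tried once manually:
sh /opt/datas/track_logs/daily_hour_visit.sh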
Problem: once the scripts are written, simply running them fulfils the requirement above, but having someone run them manually every day is not realistic, so we schedule them with Linux crontab.
The plan: at 01:10 every night load the previous day's data, then at 02:10 load the results into the Hive result table and export them to the relational database. Edit the schedule with crontab -e:
## load_track-log.sh
10 01 * * * /opt/datas/track_logs/load_track-log.sh
## daily_hour_visit.sh
10 02 * * * /opt/datas/track_logs/daily_hour_visit.sh
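The saved schedule can be confirmed with:
crontab -l
If a job does not fire, the crond service and the scripts' execute permissions are the usual things to check.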