@songlaf
2016-06-02T15:38:10.000000Z
字数 1886
阅读 617
北风网大数据培训
需求:需要统计网站的访问信息date,UV,PV,登录人数,游客人数,平均访问时长,二跳率(访问网页两次以上), 独立IP。
首先对原始数据根据sessionid进行统计,统计到下表的数据,然后再对这张表进行统计。
字段名 | 说明 | 备注 |
---|---|---|
会话ID | sessionid | |
guid | 访客的ID | |
trackerU | 渠道ID | 访问网站的途径 |
landing_url | 访问网站的第一个网页 | |
landing_url | 着陆之前的网页 | |
user_id | 用户id | |
pv | 网站pv | |
stay_time | 停留时间 | 用服务器记录的时间的最大值与最小值之间的差 |
min_tracktime | 最小时间 | |
ip | 省份时间 | |
date | 日期 | 分区 |
#建立目标数据
create table session_info(
session_id string,
guid string,
trackerU string,
landing_url string,
landing_url_ref string,
user_id string,
pv string,
stay_time string,
min_trackTime string,
ip string,
provinceId string
)
partitioned by (date string)
row format delimited fields terminated by '\t' ;
create table session_info_tmp1 as select
sessionId session_id,
max(guid) guid,
--trackerU string,
--landing_url string,
--landing_url_ref string,
max(endUserId) user_id,
count(url) pv,
(max(unix_timestamp(trackTime)) - min(unix_timestamp(trackTime))) stay_time,
min(trackTime) min_trackTime ,
max(ip) ip,
max(provinceId) provinceId
from track_log where date='20150828'
group by sessionId ;
create table session_info_tmp2 as select
sessionId session_id,
trackTime trackTime,
trackeru trackerU,
url landing_url,
referer landing_url_ref
from track_log where date='20150828' ;
````
<div class="md-section-divider"></div>
##2.4)汇总数据
<div class="md-section-divider"></div>
```sql
insert overwrite table session_info partition (date='20150828')
select
a.session_id,
a.guid,
b.trackerU,
b.landing_url,
b.landing_url_ref,
a.user_id,
a.pv,
a.stay_time,
a.min_trackTime,
a.ip,
a.provinceId
from session_info_tmp1 a join session_info_tmp2 b
on a.session_id=b.session_id
and a.min_trackTime=b.trackTime ;
create table vistor_users_info as
select
date,
count(distinct guid) UV,
sum(pv) PV,
count(case when user_id != '' then user_id else null end) login_users,
count(case when user_id = '' then user_id else null end) vistor_users,
avg(stay_time) avg_stay_time,
count(case when pv>=2 then session_id else null end)/count(session_id) sec_ratio,
count(distinct ip) ip
from session_info where date='20150828'
group by date ;