@tsing1226
2015-12-30T16:49:21.000000Z
字数 1940
阅读 951
hive
create database if not exists db_1205 ;
create table IF NOT EXISTS db_1205.bf_log_src (
remote_addr string,
remote_user string,
time_local string,
request string,
status string,
body_bytes_sent string,
request_body string,
http_referer string,
http_user_agent string,
http_x_forwarded_for string,
host string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ' '
stored as textfile ;
load data local inpath '/opt/datas/moodle.ibeifeng.access.log' into table db_1205.bf_log_src ;
select count(1) from db_1205.bf_log_src ;
select * from db_1205.bf_log_src limit 2 ;
日志文件中的每条数据中,存在空格,导致加载数据有问题
对原数据进行预处理
* 原表
原数据,文件内容不变
* 针对不同的月创建不同的子表
* 数据文件存储格式
orcfile/parquet
* 数据文件压缩
snappy
* map output
中间结果数据是否压缩snappy
* 外部表
* 分区表
验证正则表达式正确的网址:http://tool.chinaz.com/regex
drop table IF EXISTS db_1205.bf_log_src ;
create table db_1205.bf_log_src (
remote_addr string,
remote_user string,
time_local string,
request string,
status string,
body_bytes_sent string,
request_body string,
http_referer string,
http_user_agent string,
http_x_forwarded_for string,
host string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
"input.regex" = "(\"[^ ]*\") (\"-|[^ ]*\") (\"[^\]]*\") (\"[^\"]*\") (\"[0-9]*\") (\"[0-9]*\") (-|[^ ]*) (\"[^ ]*\") (\"[^\"]*\") (-|[^ ]*) (\"[^ ]*\")"
)
stored as textfile ;
load data local inpath '/opt/datas/moodle.ibeifeng.access.log' into table db_1205.bf_log_src ;
select * from db_1205.bf_log_src limit 2 ;
方式1
drop table if exists db_1205.df_log_comm ;
create table IF NOT EXISTS db_1205.df_log_comm (
remote_addr string,
time_local string,
request string,
http_referer string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
stored as orc tblproperties ("orc.compress"="SNAPPY");
方式2
create table IF NOT EXISTS db_1205.df_log_comm2 ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' stored as orc tblproperties ("orc.compress"="SNAPPY") AS select remote_addr, time_local, request, http_referer from db_1205.bf_log_src ;
insert into table db_1205.df_log_comm
select remote_addr, time_local, request, http_referer from db_1205.bf_log_src ;