[关闭]
@tsing1226 2015-12-30T16:49:21.000000Z 字数 1940 阅读 951

hive

Hive案例分析与实践

 引出问题

创建数据库

create database if not exists db_1205 ;
create table IF NOT EXISTS db_1205.bf_log_src (
remote_addr string,
remote_user string,
time_local string,
request string,
status string,
body_bytes_sent string,
request_body string,
http_referer string,
http_user_agent string,
http_x_forwarded_for string,
host string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ' '
stored as textfile ;

加载数据

load data local inpath '/opt/datas/moodle.ibeifeng.access.log' into table db_1205.bf_log_src ;

查看数据的条数

select  count(1) from db_1205.bf_log_src ; 

查看表格数据

select * from db_1205.bf_log_src limit 2 ;

出现问题

日志文件中的每条数据中,存在空格,导致加载数据有问题
对原数据进行预处理

思路

* 原表
    原数据,文件内容不变
* 针对不同的月创建不同的子表
    * 数据文件存储格式
        orcfile/parquet
    * 数据文件压缩
        snappy
    * map output 
        中间结果数据是否压缩snappy
    * 外部表
    * 分区表

验证正则表达式正确的网址:http://tool.chinaz.com/regex

创建数据库

drop table IF EXISTS db_1205.bf_log_src ;

创建表格

create table db_1205.bf_log_src (
remote_addr string,
remote_user string,
time_local string,
request string,
status string,
body_bytes_sent string,
request_body string,
http_referer string,
http_user_agent string,
http_x_forwarded_for string,
host string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
  "input.regex" = "(\"[^ ]*\") (\"-|[^ ]*\") (\"[^\]]*\") (\"[^\"]*\") (\"[0-9]*\") (\"[0-9]*\") (-|[^ ]*) (\"[^ ]*\") (\"[^\"]*\") (-|[^ ]*) (\"[^ ]*\")"
)
stored as textfile ;

正则表达式匹配的结果

加载数据

load data local inpath '/opt/datas/moodle.ibeifeng.access.log' into table db_1205.bf_log_src ;

查看表格数据

select * from db_1205.bf_log_src limit 2 ;

创建表格以orc存储,并以snappy压缩

方式1

drop table if exists db_1205.df_log_comm ;
create table IF NOT EXISTS db_1205.df_log_comm (
remote_addr string,
time_local string,
request string,
http_referer string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
stored as orc tblproperties ("orc.compress"="SNAPPY");

方式2

create table IF NOT EXISTS db_1205.df_log_comm2
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
stored as orc tblproperties ("orc.compress"="SNAPPY")
AS select remote_addr, time_local, request, http_referer from db_1205.bf_log_src ;

加载数据:

insert into table db_1205.df_log_comm
select remote_addr, time_local, request, http_referer from db_1205.bf_log_src ;

查询表格数据

添加新批注
在作者公开此批注前,只有你和作者可见。
回复批注