@tsing1226
2015-12-11T17:04:55.000000Z
Where does the data to be analyzed come from?
* RDBMS (Oracle, MySQL, DB2, etc.): the data has to be migrated into HDFS
-> Sqoop (SQL to Hadoop)
Kettle
* Log files: how do they get into HDFS?
-> Flume
Kafka
* There are many MapReduce jobs
Enterprises need to run them on a schedule
* A business task often needs several MapReduce jobs to complete,
running one after another: job1 -> job2 -> job3,
which calls for dependency-based scheduling
Oozie: a workflow coordination framework, scheduling by time and by data availability
Azkaban
HDFS (50070) \ YARN (8088) \ MapReduce JobHistory (19888) \ Hive (shell) \ Sqoop \ Flume \ Oozie
Is there a framework or tool that brings all of the above together in one place, for example in a single web UI?
Hue, a framework contributed by Cloudera
Cloudera software downloads:
http://archive.cloudera.com/cdh5/
hadoop-2.5.0-cdh5.3.6
`http://archive.cloudera.com/cdh5/cdh/5/hadoop-2.5.0-cdh5.3.6.tar.gz`
hive-0.13.1-cdh5.3.6
`http://archive.cloudera.com/cdh5/cdh/5/hive-0.13.1-cdh5.3.6.tar.gz`
zookeeper-3.4.5-cdh5.3.6
`http://archive.cloudera.com/cdh5/cdh/5/zookeeper-3.4.5-cdh5.3.6.tar.gz`
sqoop-1.4.5-cdh5.3.6
`http://archive.cloudera.com/cdh5/cdh/5/sqoop-1.4.5-cdh5.3.6.tar.gz`
tar -zxf sqoop-1.4.5-cdh5.3.6.tar.gz -C /opt/cdh3.5.6/
Then edit conf/sqoop-env.sh (copied from sqoop-env-template.sh) in the Sqoop directory so Sqoop can find Hadoop, Hive, and ZooKeeper:
#Set path to where bin/hadoop is available
export HADOOP_COMMON_HOME=/opt/cdh3.5.6/hadoop-2.5.0-cdh5.3.6
#Set path to where hadoop-*-core.jar is available
export HADOOP_MAPRED_HOME=/opt/cdh3.5.6/hadoop-2.5.0-cdh5.3.6
#Set the path to where bin/hive is available
export HIVE_HOME=/opt/cdh3.5.6/hive-0.13.1-cdh5.3.6
#Set the path for where zookeeper config dir is
export ZOOCFGDIR=/opt/cdh3.5.6/zookeeper-3.4.5-cdh5.3.6/conf
export ZOOKEEPER_HOME=/opt/cdh3.5.6/zookeeper-3.4.5-cdh5.3.6
Copy the MySQL JDBC driver into Sqoop's lib directory:
cp -r mysql-connector-java-5.1.27/mysql-connector-java-5.1.27-bin.jar /opt/cdh3.5.6/sqoop-1.4.5-cdh5.3.6/lib/
Verify the setup by listing the databases on the MySQL server:
bin/sqoop list-databases \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306 \
--username root \
--password 123456
Sqoop
Sqoop packages the MapReduce jobs commonly used to import and export data between an RDBMS and Hadoop; you pass arguments on the command line and it runs the corresponding MapReduce job.
The work is done by map tasks only (running in parallel, in bulk).
Sqoop depends on Hadoop:
* the Hadoop side of the data is stored on HDFS
* the underlying data transfer is implemented with MapReduce / YARN
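Not from the original notes, but a quick way to see the tools Sqoop wraps and the arguments each tool accepts:
bin/sqoop help
bin/sqoop help import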
Sqoop data import flow (diagram)
Prepare a test table and some rows in MySQL:
CREATE TABLE `my_user` (
`id` tinyint(4) NOT NULL AUTO_INCREMENT,
`account` varchar(255) DEFAULT NULL,
`passwd` varchar(255) DEFAULT NULL,
PRIMARY KEY (`id`)
);
INSERT INTO `my_user` VALUES ('1', 'admin', 'admin');
INSERT INTO `my_user` VALUES ('2', 'pu', '12345');
INSERT INTO `my_user` VALUES ('3', 'system', 'system');
INSERT INTO `my_user` VALUES ('4', 'zxh', 'zxh');
INSERT INTO `my_user` VALUES ('5', 'test', 'test');
INSERT INTO `my_user` VALUES ('6', 'pudong', 'pudong');
INSERT INTO `my_user` VALUES ('7', 'qiqi', 'qiqi');
Import the whole table with Sqoop's defaults:
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
--password 123456 \
--table my_user
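With no --target-dir, Sqoop writes under the importing user's HDFS home directory, in a folder named after the table. A quick check, assuming the /user/grc home directory used elsewhere in these notes:
# assumes the job ran as user grc
bin/hdfs dfs -ls /user/grc/my_user
bin/hdfs dfs -text /user/grc/my_user/part-m-*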
Set the number of map tasks and the HDFS target directory explicitly:
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
--password 123456 \
--table my_user \
--num-mappers 1 \
--target-dir /user/grc/sqoop/import/imp_my_user
Run with 3 mappers, deleting the target directory first if it already exists:
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
--password 123456 \
--table my_user \
--num-mappers 3 \
--target-dir /user/grc/sqoop/import/imp_my_user \
--delete-target-dir
15/11/18 03:09:25 WARN manager.MySQLManager: It looks like you are importing from mysql.
15/11/18 03:09:25 WARN manager.MySQLManager: This transfer can be faster! Use the --direct
15/11/18 03:09:25 WARN manager.MySQLManager: option to exercise a MySQL-specific fast path.
Prompt for the password with -P and use the MySQL-specific fast path (--direct):
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
-P \
--table my_user \
--num-mappers 1 \
--target-dir /user/grc/sqoop/import/imp_my_user \
--delete-target-dir \
--direct
Use a tab as the output field delimiter:
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
-P \
--table my_user \
--num-mappers 1 \
--target-dir /user/grc/sqoop/import/imp_my_user \
--delete-target-dir \
--direct \
--fields-terminated-by "\t"
Incremental import: append only the rows whose id is greater than the last imported value (4):
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
-P \
--table my_user \
--num-mappers 1 \
--target-dir /user/grc/sqoop/import/imp_my_user \
--direct \
--fields-terminated-by "\t" \
--check-column id \
--incremental append \
--last-value 4
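For recurring incremental loads, Sqoop can store the import as a saved job and track --last-value automatically. A minimal sketch; the job name imp_my_user_incr is made up here:
# job name imp_my_user_incr is illustrative
bin/sqoop job --create imp_my_user_incr \
-- import \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
-P \
--table my_user \
--num-mappers 1 \
--target-dir /user/grc/sqoop/import/imp_my_user \
--fields-terminated-by "\t" \
--check-column id \
--incremental append \
--last-value 4
bin/sqoop job --exec imp_my_user_incr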
Store the imported data as Parquet files:
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
-P \
--table my_user \
--num-mappers 1 \
--target-dir /user/grc/sqoop/import/imp_my_user \
--delete-target-dir \
--direct \
--fields-terminated-by "\t" \
--as-parquetfile
The same Parquet import without --direct:
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
-P \
--table my_user \
--num-mappers 1 \
--target-dir /user/grc/sqoop/import/imp_my_user \
--delete-target-dir \
--fields-terminated-by "\t" \
--as-parquetfile
Compress the output with Snappy:
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
-P \
--table my_user \
--num-mappers 1 \
--target-dir /user/grc/sqoop/import/imp_my_user_snappy \
--delete-target-dir \
--direct \
--fields-terminated-by "\t" \
--compress \
--compression-codec org.apache.hadoop.io.compress.SnappyCodec
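To confirm the compression took effect, list the output directory; the part files should carry a .snappy extension:
bin/hdfs dfs -ls /user/grc/sqoop/import/imp_my_user_snappy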
Import only selected columns:
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
--password 123456 \
--table my_user \
--num-mappers 1 \
--columns id \
--target-dir /user/grc/sqoop/import/imp_my_user_column
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
--password 123456 \
--table my_user \
--num-mappers 1 \
--columns id,passwd \
--target-dir /user/grc/sqoop/import/imp_my_user_column2
Import the result of a free-form query; the query must contain $CONDITIONS:
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
--password 123456 \
--num-mappers 1 \
--query 'select e.account from my_user e WHERE $CONDITIONS' \
--target-dir /user/grc/sqoop/import/imp_my_user5
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
--password 123456 \
--num-mappers 1 \
--query 'select e.account from my_user e WHERE id > 4 and $CONDITIONS' \
--target-dir /user/grc/sqoop/import/imp_my_user3
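Worth noting: when a free-form --query runs with more than one mapper, Sqoop requires --split-by so it knows how to partition the rows. A sketch; the output directory name here is my own:
# target dir imp_my_user_split is illustrative
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
--password 123456 \
--num-mappers 2 \
--split-by id \
--query 'select id, account from my_user WHERE $CONDITIONS' \
--target-dir /user/grc/sqoop/import/imp_my_user_split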
Sqoop data export flow (diagram)
Create the target table in MySQL:
CREATE TABLE my_user2 (
  id tinyint(4) NOT NULL AUTO_INCREMENT,
  account varchar(255) DEFAULT NULL,
  passwd varchar(255) DEFAULT NULL,
  PRIMARY KEY (id)
);
Export the imported HDFS directory back into the MySQL table:
bin/sqoop export \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
--password 123456 \
--table my_user2 \
--num-mappers 1 \
--input-fields-terminated-by "\t" \
--export-dir /user/grc/sqoop/import/imp_my_user
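To confirm the export, query the table with the mysql client (assuming it is available on the machine running the command):
mysql -uroot -p123456 -h hadoop-senior01.grc.com -e "select * from db_1206.my_user2;"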
A second target table for the batch export:
CREATE TABLE my_user3 (
  id tinyint(4) NOT NULL AUTO_INCREMENT,
  account varchar(255) DEFAULT NULL,
  passwd varchar(255) DEFAULT NULL,
  PRIMARY KEY (id)
);
Export in batch mode (--batch groups the inserts into JDBC batches):
bin/sqoop export \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
--password 123456 \
--table my_user3 \
--num-mappers 1 \
--batch \
--input-fields-terminated-by "\t" \
--export-dir /user/grc/sqoop/import/imp_my_user
Import from MySQL directly into Hive: first create the Hive table.
drop table if exists db_1206.h_user ;
create table db_1206.h_user(
id int,
account string,
password string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' ;
Then run the import with --hive-import:
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
--password 123456 \
--table my_user \
--num-mappers 1 \
--fields-terminated-by "\t" \
--delete-target-dir \
--hive-database db_1206 \
--hive-import \
--hive-table h_user
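A quick check, added here rather than in the original notes: query the table from the Hive installation directory to verify the import landed.
bin/hive -e "select * from db_1206.h_user;"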
To export a Hive table back to MySQL, first create the target table in MySQL:
CREATE TABLE user_export (
  id tinyint(4) NOT NULL AUTO_INCREMENT,
  account varchar(255) DEFAULT NULL,
  passwd varchar(255) DEFAULT NULL,
  PRIMARY KEY (id)
);
Then export the files under the Hive table's warehouse directory:
bin/sqoop export \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
--password 123456 \
--table user_export \
--num-mappers 1 \
--input-fields-terminated-by "\t" \
--export-dir /user/hive/warehouse/db_1206.db/h_user
Sqoop arguments can also be kept in an options file, one option or value per line:
import
--connect
jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206
--username
root
--password
123456
--table
my_user
--num-mappers
1
--target-dir
/user/hive/warehouse/db_1206.db/my_user
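The options file is passed to Sqoop with --options-file; blank lines and lines starting with # are ignored. The file name below is only an example:
# sqoop_import_my_user.opt is a made-up file name for the options above
bin/sqoop --options-file sqoop_import_my_user.opt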
Reference: http://archive.cloudera.com/cdh5/cdh/5/sqoop-1.4.5-cdh5.3.6/SqoopUserGuide.html