Where does the data we analyze come from?
* RDBMS (Oracle, MySQL, DB2, etc.): migrate the data to HDFS
    -> Sqoop (SQL-to-Hadoop)
    -> Kettle
* Log files: how do we move them onto HDFS?
    -> Flume
    -> Kafka
Why do we need workflow scheduling?
* MapReduce jobs pile up, and enterprises run them on timed schedules.
* A single business task often needs several MapReduce jobs to complete.
* Jobs chain together (job1 -> job2 -> job3), which requires dependency scheduling; a naive shell version is sketched below.
Oozie: a workflow coordination framework that triggers jobs by time and by data availability.
Azkaban is an alternative.
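Without a scheduler, the usual stopgap is chaining jobs from a shell script. A minimal sketch (the jar names and main classes are hypothetical placeholders):
#!/bin/bash
# Run each job only if the previous one succeeded; any failure stops the chain.
bin/yarn jar job1.jar com.example.Job1 \
  && bin/yarn jar job2.jar com.example.Job2 \
  && bin/yarn jar job3.jar com.example.Job3
Oozie and Azkaban replace this fragile chain with declarative workflows that handle timers, retries, and data triggers.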
HDFS (50070) / YARN (8088) / MapReduce JobHistory (19888) / Hive (shell) / Sqoop / Flume / Oozie
Is there a framework or tool that brings all of these together in one place, say in a single web UI?
Yes: Hue, a framework contributed by Cloudera.
Cloudera software downloads: http://archive.cloudera.com/cdh5/
hadoop-2.5.0-cdh5.3.6
    `http://archive.cloudera.com/cdh5/cdh/5/hadoop-2.5.0-cdh5.3.6.tar.gz`
hive-0.13.1-cdh5.3.6
    `http://archive.cloudera.com/cdh5/cdh/5/hive-0.13.1-cdh5.3.6.tar.gz`
zookeeper-3.4.5-cdh5.3.6
    `http://archive.cloudera.com/cdh5/cdh/5/zookeeper-3.4.5-cdh5.3.6.tar.gz`
sqoop-1.4.5-cdh5.3.6
    `http://archive.cloudera.com/cdh5/cdh/5/sqoop-1.4.5-cdh5.3.6.tar.gz`
Unpack Sqoop into the install directory:
tar -zxf sqoop-1.4.5-cdh5.3.6.tar.gz -C /opt/cdh3.5.6/
Then point Sqoop at Hadoop, Hive, and ZooKeeper in conf/sqoop-env.sh:
#Set path to where bin/hadoop is available
export HADOOP_COMMON_HOME=/opt/cdh3.5.6/hadoop-2.5.0-cdh5.3.6
#Set path to where hadoop-*-core.jar is available
export HADOOP_MAPRED_HOME=/opt/cdh3.5.6/hadoop-2.5.0-cdh5.3.6
#Set the path to where bin/hive is available
export HIVE_HOME=/opt/cdh3.5.6/hive-0.13.1-cdh5.3.6
#Set the path to where zookeeper config dir is
export ZOOCFGDIR=/opt/cdh3.5.6/zookeeper-3.4.5-cdh5.3.6/conf
export ZOOKEEPER_HOME=/opt/cdh3.5.6/zookeeper-3.4.5-cdh5.3.6
Copy the MySQL JDBC driver into Sqoop's lib directory:
cp mysql-connector-java-5.1.27/mysql-connector-java-5.1.27-bin.jar /opt/cdh3.5.6/sqoop-1.4.5-cdh5.3.6/lib/
Verify the setup by listing the MySQL databases:
bin/sqoop list-databases \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306 \
--username root \
--password 123456

What is Sqoop?
    Sqoop wraps the common MapReduce jobs that move data between an RDBMS and Hadoop, running them from command-line parameters so you never write the MapReduce code yourself.
    Imports run as map tasks only (parallel, in batches); there is no reduce phase.
Sqoop depends on Hadoop:
    * the data lives on the HDFS side;
    * the underlying data transfer is implemented with MapReduce / YARN.
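You can see the MapReduce wrapping directly: for every table, Sqoop generates a Java record class before launching the job. A quick way to inspect it, once the my_user table below exists (a sketch reusing the same connection parameters):
bin/sqoop codegen \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
--password 123456 \
--table my_user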
Sqoop import data flow (diagram omitted).
Create a test table in MySQL and seed it with a few rows:
CREATE TABLE `my_user` (
  `id` tinyint(4) NOT NULL AUTO_INCREMENT,
  `account` varchar(255) DEFAULT NULL,
  `passwd` varchar(255) DEFAULT NULL,
  PRIMARY KEY (`id`)
);
INSERT INTO `my_user` VALUES ('1', 'admin', 'admin');
INSERT INTO `my_user` VALUES ('2', 'pu', '12345');
INSERT INTO `my_user` VALUES ('3', 'system', 'system');
INSERT INTO `my_user` VALUES ('4', 'zxh', 'zxh');
INSERT INTO `my_user` VALUES ('5', 'test', 'test');
INSERT INTO `my_user` VALUES ('6', 'pudong', 'pudong');
INSERT INTO `my_user` VALUES ('7', 'qiqi', 'qiqi');
Import the whole table with the defaults (four map tasks); output lands under the HDFS home directory, here /user/grc/my_user:
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
--password 123456 \
--table my_user
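A quick look at what landed (the default path assumes the HDFS user is grc):
bin/hdfs dfs -cat /user/grc/my_user/part-m-*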

Control parallelism and the output location with --num-mappers and --target-dir:
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
--password 123456 \
--table my_user \
--num-mappers 1 \
--target-dir /user/grc/sqoop/import/imp_my_user
 

Re-running into an existing directory fails, so add --delete-target-dir to overwrite it; with three map tasks, Sqoop splits the table on the primary key into three output files:
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
--password 123456 \
--table my_user \
--num-mappers 3 \
--target-dir /user/grc/sqoop/import/imp_my_user \
--delete-target-dir
15/11/18 03:09:25 WARN manager.MySQLManager: It looks like you are importing from mysql.
15/11/18 03:09:25 WARN manager.MySQLManager: This transfer can be faster! Use the --direct
15/11/18 03:09:25 WARN manager.MySQLManager: option to exercise a MySQL-specific fast path.
Take the hint: for MySQL, --direct uses mysqldump as a faster transfer path. Here -P also prompts for the password instead of putting it on the command line:
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
-P \
--table my_user \
--num-mappers 1 \
--target-dir /user/grc/sqoop/import/imp_my_user \
--delete-target-dir \
--direct
The default field separator in text output is a comma; change it with --fields-terminated-by, here to tabs:
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
-P \
--table my_user \
--num-mappers 1 \
--target-dir /user/grc/sqoop/import/imp_my_user \
--delete-target-dir \
--direct \
--fields-terminated-by "\t"

Incremental import: append only rows whose --check-column value exceeds --last-value (here, rows with id > 4). Note that --delete-target-dir is dropped, because new files are appended to the existing directory:
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
-P \
--table my_user \
--num-mappers 1 \
--target-dir /user/grc/sqoop/import/imp_my_user \
--direct \
--fields-terminated-by "\t" \
--check-column id \
--incremental append \
--last-value 4
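Tracking --last-value by hand is error-prone; Sqoop's saved jobs can record it for you. A sketch (the job name imp_my_user_incr is arbitrary):
bin/sqoop job --create imp_my_user_incr \
-- import \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
-P \
--table my_user \
--num-mappers 1 \
--target-dir /user/grc/sqoop/import/imp_my_user \
--check-column id \
--incremental append \
--last-value 0
Each execution then updates the stored last-value automatically:
bin/sqoop job --exec imp_my_user_incr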

Store the output as Parquet with --as-parquetfile:
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
-P \
--table my_user \
--num-mappers 1 \
--target-dir /user/grc/sqoop/import/imp_my_user \
--delete-target-dir \
--direct \
--fields-terminated-by "\t" \
--as-parquetfile

The mysqldump fast path emits delimited text, so --direct does not mix with Parquet output; run the same import without it:
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
-P \
--table my_user \
--num-mappers 1 \
--target-dir /user/grc/sqoop/import/imp_my_user \
--delete-target-dir \
--fields-terminated-by "\t" \
--as-parquetfile

Compress the output with Snappy (the Hadoop native libraries must include Snappy support):
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
-P \
--table my_user \
--num-mappers 1 \
--target-dir /user/grc/sqoop/import/imp_my_user_snappy \
--delete-target-dir \
--direct \
--fields-terminated-by "\t" \
--compress \
--compression-codec org.apache.hadoop.io.compress.SnappyCodec
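The output files should now carry a .snappy extension; a quick check (assuming the directory above):
bin/hdfs dfs -ls /user/grc/sqoop/import/imp_my_user_snappy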

Import only selected columns with --columns:
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
--password 123456 \
--table my_user \
--num-mappers 1 \
--columns id \
--target-dir /user/grc/sqoop/import/imp_my_user_column

Or several columns, comma-separated:
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
--password 123456 \
--table my_user \
--num-mappers 1 \
--columns id,passwd \
--target-dir /user/grc/sqoop/import/imp_my_user_column2

Free-form query import: drop --table and pass --query. The query must contain the $CONDITIONS token, which Sqoop replaces with its split predicates:
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
--password 123456 \
--num-mappers 1 \
--query 'select e.account from my_user e WHERE $CONDITIONS' \
--target-dir /user/grc/sqoop/import/imp_my_user5

Extra conditions go inside the query, ANDed with $CONDITIONS:
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
--password 123456 \
--num-mappers 1 \
--query 'select e.account from my_user e WHERE id > 4 and $CONDITIONS' \
--target-dir /user/grc/sqoop/import/imp_my_user3
 
Sqoop export data flow (diagram omitted). Export reverses the direction: map tasks read files from HDFS and insert the rows into an RDBMS table, which must already exist:
CREATE TABLE my_user2 ( 
  id tinyint(4) NOT NULL AUTO_INCREMENT, 
  account varchar(255) DEFAULT NULL, 
  passwd varchar(255) DEFAULT NULL, 
  PRIMARY KEY (id) 
);
Export the earlier import back into MySQL; --input-fields-terminated-by must match the delimiter the files were written with:
bin/sqoop export \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
--password 123456 \
--table my_user2 \
--num-mappers 1 \
--input-fields-terminated-by "\t" \
--export-dir /user/grc/sqoop/import/imp_my_user
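Verify on the MySQL side (assuming the mysql client is on the path):
mysql -uroot -p -e 'select * from db_1206.my_user2;'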

CREATE TABLE my_user3 ( 
  id tinyint(4) NOT NULL AUTO_INCREMENT, 
  account varchar(255) DEFAULT NULL, 
  passwd varchar(255) DEFAULT NULL, 
  PRIMARY KEY (id) 
); 
--batch groups the generated INSERT statements into JDBC batches for better throughput:
bin/sqoop export \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
--password 123456 \
--table my_user3 \
--num-mappers 1 \
--batch \
--input-fields-terminated-by "\t" \
--export-dir /user/grc/sqoop/import/imp_my_user

Import straight into Hive: create the Hive table first, then add --hive-import to the Sqoop command:
drop table if exists db_1206.h_user ;
create table db_1206.h_user(
id int,
account string,
password string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' ;
bin/sqoop import \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
--password 123456 \
--table my_user \
--num-mappers 1 \
--fields-terminated-by "\t" \
--delete-target-dir \
--hive-database db_1206 \
--hive-import \
--hive-table h_user
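Check the result from the Hive shell (bin/hive here is relative to the Hive install directory):
bin/hive -e 'select * from db_1206.h_user;'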

CREATE TABLE user_export ( 
  id tinyint(4) NOT NULL AUTO_INCREMENT, 
  account varchar(255) DEFAULT NULL, 
  passwd varchar(255) DEFAULT NULL, 
  PRIMARY KEY (id) 
);
Exporting a Hive table is just exporting its warehouse directory:
bin/sqoop export \
--connect jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 \
--username root \
--password 123456 \
--table user_export \
--num-mappers 1 \
--input-fields-terminated-by "\t" \
--export-dir /user/hive/warehouse/db_1206.db/h_user

Frequently used commands can be kept in an options file, one option or value per line. Save the following as, for example, sqoop_import.txt (the file name is arbitrary):
import
--connect 
jdbc:mysql://hadoop-senior01.grc.com:3306/db_1206 
--username
root
--password
123456
--table
my_user
--num-mappers
1
--target-dir
/user/hive/warehouse/db_1206.db/my_user
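Then hand the file to Sqoop with --options-file:
bin/sqoop --options-file sqoop_import.txt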
Reference: http://archive.cloudera.com/cdh5/cdh/5/sqoop-1.4.5-cdh5.3.6/SqoopUserGuide.html