当前位置: 开发笔记 > 编程语言 > 正文

sqoop基本配置与导入导出学习笔记(Draft)

作者：小新没蜡笔775 | 来源：互联网 | 2023-05-28 16:44

注，文档由word粘贴过后格式有错乱,图片也无法显示，稍后会进行调整，并将代码用代码格式显示。一.Sqoop介绍及安装http:sqoop

注&＃xff0c;文档由word粘贴过后格式有错乱,图片也无法显示&＃xff0c;稍后会进行调整&＃xff0c;并将代码用代码格式显示。

一.Sqoop介绍及安装

http://sqoop.apache.org/

文档&＃xff1a;http://sqoop.apache.org/docs/1.4.6/index.html

https://dev.mysql.com/downloads/mysql/

解压后的环境变量配置&＃xff1a;

vi /etc/profile

#set sqoop home

SQOOP_HOME&＃61;/sqoop-1.4.6

PATH&＃61;$SQOOP_HOME/bin:$PATH

export SQOOP_HOME PATH

验证版本&＃xff0c;暂时忽略其他因组件不存在而抛出的警告

#sqoop version

与往常一样&＃xff0c;要使sqoop连接到不同的数据库&＃xff0c;则需要先加载对应数据库的jar包才行。

放在$SQOOP_HOME/lib中

Mysql&＃xff1a;

[root&＃64;hadoop1setup]# rpm -vih --nodeps mysql-community-libs-compat-5.7.17-1.el5.x86_64.rpm

[root&＃64;hadoop1setup]# rpm -vih --nodeps mysql-community-libs-5.7.17-1.el5.x86_64.rpm

[root&＃64;hadoop1setup]# rpm -vih mysql-community-client-5.7.17-1.el5.x86_64.rpm

[root&＃64;hadoop1setup]# rpm -vih mysql-5.7.17-1.el5.x86_64.rpm

[root&＃64;hadoop1setup]# rpm -vih mysql-community-common-5.7.17-1.el5.x86_64.rpm

[root&＃64;hadoop1setup]# rpm -vih mysql-community-server-5.7.17-1.el5.x86_64.rpm

/etc/rc.d/init.d/mysqldstart|status

servicemysqld start

[root&＃64;hadoop1setup]# service mysqld stop

[root&＃64;hadoop1setup]# mysqld_safe --skip-grant-tables

[root&＃64;hadoop1~]# mysql

mysql>use mysql;

mysql>update user set authentication_string&＃61;password(&＃39;root&＃39;) where user&＃61;&＃39;root&＃39;;

QueryOK, 0 rows affected, 1 warning (0.00 sec)

Rowsmatched: 1 Changed: 0 Warnings: 1

mysql>flush privileges;

QueryOK, 0 rows affected (0.00 sec)

mysql>quit

[root&＃64;hadoop1~]# mysql -u root -p

mysql>set password&＃61;password(&＃39;Gopro5hero!&＃39;);

QueryOK, 0 rows affected, 1 warning (0.00 sec)

mysql>show databases;

&＃43;--------------------&＃43;

|Database |

&＃43;--------------------&＃43;

|information_schema |

|mysql |

|performance_schema |

|sys |

&＃43;--------------------&＃43;

4rows in set (0.00 sec)

mysql>select user,host from user;

&＃43;-----------&＃43;-----------&＃43;

|user | host |

&＃43;-----------&＃43;-----------&＃43;

|Bond | localhost |

|mysql.sys | localhost |

|root | localhost |

&＃43;-----------&＃43;-----------&＃43;

3rows in set (0.00 sec)

mysql>update user set host&＃61;&＃39;%&＃39; where user&＃61;&＃39;root&＃39;;

QueryOK, 1 row affected (0.00 sec)

Rowsmatched: 1 Changed: 1 Warnings: 0

mysql>flush privileges;

QueryOK, 0 rows affected (0.00 sec)

mysql>select user,host from user;

&＃43;-----------&＃43;-----------&＃43;

|user | host |

&＃43;-----------&＃43;-----------&＃43;

|root | % |

|Bond | localhost |

|mysql.sys | localhost |

&＃43;-----------&＃43;-----------&＃43;

3rows in set (0.00 sec)

mysql>CREATE USER &＃39;Bond&＃39;&＃64;&＃39;localhost&＃39; IDENTIFIED BY &＃39;test123&＃39;;

ERROR1819 (HY000): Your password does not satisfy the current policy requirements

mysql>CREATE USER &＃39;Bond&＃39;&＃64;&＃39;localhost&＃39; IDENTIFIED BY &＃39;Gopro5hero!&＃39;;

QueryOK, 0 rows affected (0.00 sec)

mysql>select host,user,authentication_string,password_expired,account_locked fromuser;

&＃43;-----------&＃43;-----------&＃43;-------------------------------------------&＃43;------------------&＃43;----------------&＃43;

&＃43;-----------&＃43;-----------&＃43;-------------------------------------------&＃43;------------------&＃43;----------------&＃43;

&＃43;-----------&＃43;-----------&＃43;-------------------------------------------&＃43;------------------&＃43;----------------&＃43;

3rows in set (0.00 sec)3 rows in set (0.00 sec)3 rows in set (0.00 sec)

mysql>delete from user where user &＃61; &＃39;Bond&＃39;;

QueryOK, 1 row affected (0.00 sec)

mysql>CREATE USER &＃39;Bond&＃39;&＃64;&＃39;localhost&＃39; IDENTIFIED BY &＃39;Gopro5hero!&＃39;;

ERROR1396 (HY000): Operation CREATE USER failed for &＃39;Bond&＃39;&＃64;&＃39;localhost&＃39;

mysql>flush privileges;

QueryOK, 0 rows affected (0.00 sec)

mysql>CREATE USER &＃39;Bond&＃39;&＃64;&＃39;localhost&＃39; IDENTIFIED BY &＃39;Gopro5hero!&＃39;;

QueryOK, 0 rows affected (0.00 sec)

mysql>alter user Bond&＃64;localhost identified by &＃39;Gopro5hero!!&＃39;;

QueryOK, 0 rows affected (0.01 sec)

mysql>flush privileges;

QueryOK, 0 rows affected (0.01 sec)

mkdir–p /usr/local/mysql/data/

chown-R mysql:mysql mysql/

/usr/local/mysql/bin/mysqld-u mysql --datadir&＃61;/usr/local/mysql/data

cpsupport-files/mysql.server /etc/init.d/mysql

二.Sqoop基本使用

以行为读取单位&＃xff0c;并行读取导入。每一行的列之间&＃xff0c;有分隔符。

Sqoop就是个ETL工具

Sqoop help import

三.Sqoop导入实战&＃xff08;mysql&＃xff09;

Mysql导入到hadoop 的hdfs

详细操作命令手册&＃xff1a;http://sqoop.apache.org/docs/1.4.6/SqoopUserGuide.html

sqoopimport --connect jdbc:mysql://hadoop1:3306/pjh -username root -password root--table t1

sqoopimport --connect jdbc:mysql://hadoop1:3306/pjh -username root -P --table t1

[root&＃64;hadoop1~]# hadoop fs -ls /user/root/t1

[root&＃64;hadoop1~]# hadoop fs -rm -R /user/root/t1

HIVE数据库回头再看&＃xff0c;目前只看hdfs

sqoopeval --connect jdbc:mysql://hadoop1:3306/pjh \

-username root -P \

--query "select *from t1 where id between 2 and 4"

#使用warehouse-dir将mysql表t1导入到hdfs指定的父目录中

[root&＃64;hadoop1 ~]#hadoop fs -mkdir -p /user/shared

sqoopimport \

--connectjdbc:mysql://hadoop1:3306/pjh -username root -P \

--tablet1 \

--warehouse-dir/user/shared \

--m1 \

--split-by id

#No primary key could be found for table t1. Please specify onewith --split-by or perform a sequential import with &＃39;--m 1&＃39;

#--m 1 时&＃xff0c;表示取消并行&＃xff0c;则仅会生成一个文件

#如果使用--split-by id&＃xff0c;则select列表中至少要包含id

#如果使用--split-by id&＃xff0c;而不指定--m&＃xff0c;则系统自动决定生成文件的数量。

#使用table, columns, where,target-dir 将指定的记录导入到指定目录.

sqoop import \

--connectjdbc:mysql://hadoop1:3306/pjh -username root -P \

--table t1 \

--columns "id,name"\

--where "idbetween &＃39;2&＃39; and &＃39;4&＃39;" \

--target-dir /user/shared/t1_q1\

--m 2 \

--split-by id

#如果使用--m 5 (--num-mappers大于1)做parallel query&＃xff0c;则必须同时指定--split-by

#如果结果集行数小于--m 5,则仅会生成一行一文件。

#--append默认追加全部记录&＃xff0c;新增文件至hdfs&＃xff0c;且不会更改原有文件

sqoopimport \

--connect jdbc:mysql://hadoop1:3306/pjh -username root -P \

--query "select id,name from t1 where id<9 and\$CONDITIONS" \

--target-dir /user/shared/t1_q2 \

--append \

--m 5 \

--split-by id

使用sqoop是可利用参数文件&＃xff0c;则不必手动输入各项参数

参数文件中可以指定密码文件路径&＃xff0c;而密码文件最好放在hdfs上

[root&＃64;hadoop1~]# echo -n "root" > /root/.password_mysql

[root&＃64;hadoop1~]# hadoop fs -mkdir -p /user/hadoop

[root&＃64;hadoop1~]# hadoop fs -put /root/.password_mysql /user/hadoop/

[root&＃64;hadoop1~]# hadoop fs -chmod 400 /user/hadoop/.password_mysql

[root&＃64;hadoop1~]# vi sqoop_import.cfg

import

--connect

jdbc:mysql://hadoop1:3306/pjh

--username

root

--password-file

/user/hadoop/.password_mysql

[root&＃64;hadoop1~]# vi sqoop_list_databases.cfg

list-databases

--connect

jdbc:mysql://hadoop1:3306/pjh

--username

root

--password-file

/user/hadoop/.password_mysql

[root&＃64;hadoop1~]# vi sqoop_list_tables.cfg

list-tables

--connect

jdbc:mysql://hadoop1:3306/pjh

--username

root

--password-file

/user/hadoop/.password_mysql

[root&＃64;hadoop1~]#

sqoop --options-file~/sqoop_import.cfg \

--tablet1 \

--target-dir/user/shared/t1_q3 \

--append\

--m1

[root&＃64;hadoop1 ~]# sqoop --options-file ~/sqoop_list_databases.cfg

[root&＃64;hadoop1 ~]#sqoop --options-file /root/sqoop_list_tables.cfg

[root&＃64;hadoop1 ~]# hadoopfs -cat /user/shared/t1_q3/*

导入时的分隔符设置

分隔符默认为逗号&＃xff0c;但是可以手动指定为其它字符&＃xff0c;例如 |

[root&＃64;hadoop1 ~]#

sqoop--options-file /root/sqoop_import.cfg \

--query "selectid,name from t1 where id<6 and \$CONDITIONS" \

--target-dir/user/shared/t1_q4 \

--append \

--m 1 \

--split-by id \

--fields-terminated-by "|"

[root&＃64;hadoop1~]# hadoopfs -ls /user/shared/t1_q4/

[root&＃64;hadoop1~]# hadoop fs -cat /user/shared/t1_q4/*

可使用--options-file标定所有参数

[root&＃64;hadoop1~]# vi sqoop_i_q5.cfg

import

--connect

jdbc:mysql://hadoop1:3306/pjh

--username

root

--password-file

/user/hadoop/.password_mysql

--query

selectid,name from t1 where id<7 and $CONDITIONS

##使用--opions-file时&＃xff0c;此处无需双引号&＃xff0c;$CONDITIONS前也无需正斜杠\

--target-dir

/user/shared/t1_q5

--split-by

[root&＃64;hadoop1~]# hadoop fs -rm -R /user/shared/t1_q5

[root&＃64;hadoop1~]# sqoop --options-file /root/sqoop_i_q5.cfg

四.Sqoop增量导入&＃xff08;mysql&＃xff09;

The merge tool istypically run after an incremental import with the date-last-modified mode

(sqoop import --incremental lastmodified …)

Note&＃xff1a;针对常规的增量update 实际上需要借助merge完成。

#初始化导入

[root&＃64;hadoop1~]# vi sqoop_i_q6_init.cfg

import

--connect

jdbc:mysql://hadoop1:3306/pjh

--username

root

--password-file

/user/hadoop/.password_mysql

--query

selectid,name,ts from t1 where id<8 and $CONDITIONS

##使用--opions-file时&＃xff0c;此处无需双引号&＃xff0c;$CONDITIONS前也无需正斜杠\

--target-dir

/user/shared/t1_q6

--split-by

[root&＃64;hadoop1~]# hadoop fs -rm -R /user/shared/t1_q6

[root&＃64;hadoop1~]# sqoop --options-file /root/sqoop_i_q6_init.cfg

[root&＃64;hadoop1~]# hadoop fs -cat /user/shared/t1_q6/*

增量导入的3个核心参数&＃xff1a;

Argument	Description
--check-column (cols)	Specifies the column to be examined when determining which rows to import. (the column should not be of type CHAR/NCHAR/VARCHAR/VARNCHAR/ LONGVARCHAR/LONGNVARCHAR)
--incremental (mode)	Specifies how Sqoop determines which rows are new. Legal values for mode include appendand lastmodified.
--last-value (value)	Specifies the maximum value of the check column from the previous import. 做增量的非必需参数

#测试--incrementalappend 做insert

[root&＃64;hadoop1~]# vi sqoop_i_q6_inc_append.cfg

import

--connect

jdbc:mysql://hadoop1:3306/pjh

--username

root

--password-file

/user/hadoop/.password_mysql

--query

selectid,name,ts from t1 where id<99 and$CONDITIONS

##使用--opions-file时&＃xff0c;此处无需双引号&＃xff0c;$CONDITIONS前也无需正斜杠\

--target-dir

/user/shared/t1_q6

--split-by

--check-column

--incremental

append

#做insert需要使用append

--last-value

2017-04-0708:38:34.0

##此处指定的值&＃xff0c;要与check column具有可比性&＃xff0c;将抽取大于该记录的行&＃xff0c;等于该值的行不会被抽取&＃xff0c;它是决定源记录是否被导入的重要依据。如果不指定--last-value,则会默认追加导入全部的行&＃xff0c;等效 --append

#在导入前&＃xff0c;在源表插入一条带有新ts的行&＃xff0c;注意query必须要能取到新加的行&＃xff0c;导入完成后将仅增加一行。并且会自动打印出下一次导入所需的--last-value值&＃xff0c;建议使用sqoop job来自动记录该值达到简化操作的目的&＃xff08;章节6&＃xff09;。

[root&＃64;hadoop1 ~]# sqoop --options-file /root/sqoop_q6_inc1_append.cfg

[root&＃64;hadoop1 ~]# hadoop fs -cat /user/shared/t1_q6/*

#测试--incrementallastmodified 做update

[root&＃64;hadoop1 ~]# visqoop_i_q6_inc_lastmodified.cfg

import

--connect

jdbc:mysql://hadoop1:3306/pjh

--username

root

--password-file

/user/hadoop/.password_mysql

--query

select id,name,ts fromt1 where id<99 and $CONDITIONS

##此处无需双引号&＃xff0c;$CONDITIONS前无需正斜杠\

--target-dir

/user/shared/t1_q6

--split-by

--check-column

#在使用lastmodified时&＃xff0c;--check-column必须是时间戳类型&＃xff0c;且必须指定&＃xff0c;但是我认为它并未起到任何作用。

--incremental

lastmodified

#--merge-key or--append is required when using --incremental lastmodified and the outputdirectory exists.

#做update需要使用lastmodified

--merge-key

#指定合并更新使用的key字段&＃xff0c;通常是唯一字段。

--last-value

2015-04-0708:38:34.0

#非必需

#在导入前&＃xff0c;在源表更新一行name&＃xff0c;注意query必须要能取到该行&＃xff0c;导入完成后将更新一行name.可以看到分成了两次job完成

[root&＃64;hadoop1~]# sqoop --options-file /root/sqoop_q6_inc_lastmodified.cfg

[root&＃64;hadoop1 ~]# hadoop fs -cat/user/shared/t1_q6/*

测试结果为何未体现&＃xff1f;&＃xff1f;&＃xff1f;

在导入前&＃xff0c;在源表更新一行name&＃xff0c;注意query必须要能取到该行&＃xff0c;导入完成后将更新一行name.可以看到分成了两次job完成

[root&＃64;hadoop1 ~]# sqoop--options-file /root/sqoop_i_q6_inc_append.cfg

[root&＃64;hadoop1~]# hadoop fs -cat /user/shared/t1_q6/*

在hdfs制造一条重复记录 id&＃61;2

sqoop import \

--connectjdbc:mysql://hadoop1:3306/pjh -username root -P \

--query "select id,name, &＃39;1999-04-0708:38:34.0&＃39; from t1 where id&＃61;2 and\$CONDITIONS" \

--target-dir/user/shared/t1_q6 \

--append \

--split-by id

在hdfs制造一条重复记录 id&＃61;1

sqoop import \

--connectjdbc:mysql://hadoop1:3306/pjh -username root -P \

--query "select id, &＃39;gopro&＃39;,&＃39;1999-04-0708:38:34.0&＃39; from t1 where id&＃61;1 and\$CONDITIONS" \

--target-dir/user/shared/t1_q6 \

--append \

--split-by id

发现&＃xff1a;

i)若源表出现了两行相同的id,而hdfs无重复&＃xff0c;会仅取rdbms中较早的一条记录更新至hdfs&＃xff0c;且与rdbms源表ts字段无关

ii)若hdfs先后插入重复id的行&＃xff0c;而rdbms无重复&＃xff0c;在导入之后&＃xff0c;hdfs仅会保存较早的一条&＃xff0c;与hdfs的ts字段无关。

iii)若hdfs先后插入重复id的行&＃xff0c;且rdbms也有该id的重复行&＃xff0c;在导入之后&＃xff0c;rdbms与hdfs均仅会保留最早的一条&＃xff0c;剩下的一行保持数据一致。

启动日志记录进程

[root&＃64;hadoop1 ~]# /hadoop-2.7.2/sbin/mr-jobhistory-daemon.shstart historyserver

使用lastmodified就可以同时做insert与update&＃xff01;但是在sqoop job中要注意&＃xff0c;如果仅仅在rdbms增加&更新记录&＃xff0c;而不去更新ts&＃xff0c;则除了新增行可以被抽取之外&＃xff0c;更新的行并不能被捕捉。因此作为RDBMS规范&＃xff0c;所有行的变更与新增都需要同时更新时间戳以便于ETL处理。

五.Sqoop导入实战&＃xff08;mysql&＃xff09;

[root&＃64;hadoop1 ~]# sqoop help export

[root&＃64;hadoop1 ~]# vi sqoop_export.cfg

export

--connect

jdbc:mysql://hadoop1:3306/pjh

--username

root

--password-file

/user/hadoop/.password_mysql

#查看hdfs源数据&＃xff1a;

[root&＃64;hadoop1 ~]# hadoop fs -cat /user/shared/t1_q6/*

#由hdfs向rdbms出表&＃xff0c;前提是&＃xff1a;目标表t2必须先创建&＃xff0c;并且字段结构要与导出源一致。如果多次导出&＃xff0c;是以追加方式写入。

[root&＃64;hadoop1~]#

sqoop --options-file~/sqoop_export.cfg\

--export-dir/user/shared/t1_q6 \

--table t2 \

--input-null-string &＃39;\\N&＃39;

–col1,col2,col3

注意&＃xff1a;没有被包含在–columns字段必须有默认值&＃xff0c;或者允许插入空值&＃xff0c;否则rdbms会拒绝sqoop导出的数据&＃xff0c;导致作业失败

–export-dir&＃xff0c;导出目录&＃xff0c;在执行导出的时候&＃xff0c;必须指定这个参数&＃xff0c;同时需要具备–table或–call参数两者之一&＃xff0c;–table是指的导出数据库当中对应的表&＃xff0c;–call是指的某个存储过程

–input-null-string、–input-null-non-string&＃xff0c;这两参数是可选&＃xff0c;如果没有指定第一个参数&＃xff0c;对于字符串类型的列来说&＃xff0c;“NULL”这个字符串就回被翻译成空值&＃xff0c;如果没有使用第二个参数&＃xff0c;无论是“NULL”字符串还是说空字符串也好&＃xff0c;对于非字符串类型的字段来说&＃xff0c;这两个类型的空串都会被翻译成空值

--update-key&＃xff0c;更新标识&＃xff0c;即根据某个字段进行更新&＃xff0c;例如id&＃xff0c;可以指定多个更新标识的&＃xff0c;用逗号分隔

--update-mode&＃xff0c;有两种模式&＃xff0c;一种是updateonly&＃xff08;默认模式&＃xff09;&＃xff0c;仅仅更新已存在的数据记录&＃xff0c;不会插入新纪录&＃xff0c;另一种模式是allowinsert&＃xff0c;允许插入新纪录

Sqoop的Export工具&＃xff0c;对应两种语句&＃xff0c;一种是Insert语句&＃xff0c;如果表当中存在PK约束&＃xff0c;且表中已包含数据&＃xff0c;此时&＃xff0c;导出报错。此时需要用到—update-key和updatemod

如果指定了update-key&＃xff0c;那么Sqoop就会修改在数据表中已存在的数据&＃xff0c;此时的每一个更行数据记录都会变成一个Update语句&＃xff0c;用这个语句去更新目标表中已存在的数据&＃xff0c;这是根据–update-key所指定的这个列进行更新的。Update set teacher&＃61;”MarsJ” where class_id &＃61; 1。

1&＃xff09;若update-key所指定的字段不是PK字段&＃xff0c;若同时updatemod使用updateonly模式时&＃xff0c;就仅进行更新&＃xff0c;若updatemod使用allowinsert模式&＃xff0c;那么实质上就是一个insert操作.

2&＃xff09;若update-key所指定的字段是PK字段&＃xff0c;同时updatemod是allowinsert时&＃xff0c;实质上是一个insert & update的操作&＃xff0c;若updatemod是updateonly时&＃xff0c;实质仅为update操作.

[root&＃64;hadoop1 ~]#

sqoop --options-file ~/sqoop_export.cfg \

--export-dir /user/shared/t1_q6 \

--table t2 \

--update-key id \

--update-mode updateonly

#若rdbms存在重复的id&＃xff08;没有pk&＃xff0c;通常是不规范的情况&＃xff09;&＃xff0c;hdfs也存在相同的重复id&＃xff0c;更新后rdbms会保留所有的行&＃xff0c;并参照hdfs保留最近的一行进行update。

#若rdbms有pk&＃xff0c;hdfs存在相同的重复id&＃xff0c;更新后并参照hdfs最早的一行进行update。

[root&＃64;hadoop1~]#

sqoop--options-file ~/sqoop_export.cfg \