-- The first column is conventionally named key; it maps to the HBase rowkey (:key in the mapping below)
CREATE EXTERNAL TABLE hivehbasetable(
    key      INT,
    name     STRING,
    age      INT,
    gender   STRING,
    clazz    STRING,
    last_mod STRING
)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,cf1:name,cf1:age,cf1:gender,cf1:clazz,cf1:last_mod")
TBLPROPERTIES("hbase.table.name" = "student");
Note: the HBase table must already exist before the Hive DDL above is run; create it first in the hbase shell:
create 'student','cf1'
select key,name from hivehbasetable limit 10;
An HBase external table cannot be loaded directly with sqoop; data has to be written through a statement such as INSERT INTO.
This is because sqoop imports via LOAD DATA, and LOAD DATA only truly loads data into a table when the table's storage format is textfile. For example:
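-- a hedged sketch: `students` stands for any existing plain Hive table with the same six columns
insert into table hivehbasetable
select id, name, age, gender, clazz, last_mod from students;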
DataX does not depend on any other service: just upload the package, extract it, and configure the environment variables.
It can also be extracted directly on Windows. A rough Linux sketch follows.
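The install prefix and the DATAX_HOME variable name below are just conventions chosen for this sketch, not anything DataX requires:
# extract the release tarball and put bin/ on the PATH
tar -zxvf datax.tar.gz -C /usr/local/
echo 'export DATAX_HOME=/usr/local/datax' >> ~/.bashrc
echo 'export PATH=$PATH:$DATAX_HOME/bin' >> ~/.bashrc
source ~/.bashrc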
# stream2stream.json
{"job": {"content": [{"reader": {"name": "streamreader","parameter": {"sliceRecordCount": 10,"column": [{"type": "long","value": "10"},{"type": "string","value": "hello,你好,世界-DataX"}]}},"writer": {"name": "streamwriter","parameter": {"encoding": "UTF-8","print": true}}}],"setting": {"speed": {"channel": 5}}}
}
datax.py stream2stream.json
Create a student2 database in MySQL and, inside it, a student2 table for the writer below (which truncates student2 before inserting).
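A minimal sketch; the column types here are assumptions and should mirror the source student table's schema:
-- target database and table for the mysqlwriter job below
CREATE DATABASE IF NOT EXISTS student2;
CREATE TABLE IF NOT EXISTS student2.student2(
    id BIGINT PRIMARY KEY,
    name VARCHAR(32),
    age INT,
    gender VARCHAR(8),
    clazz VARCHAR(32),
    last_mod VARCHAR(64)
);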
{"job": {"content": [{"reader": {"name": "mysqlreader","parameter": {"username": "root","password": "123456","column": ["id","name","age","gender","clazz","last_mod"],"splitPk": "age","connection": [{"table": ["student"],"jdbcUrl": ["jdbc:mysql://master:3306/student"]}]}},"writer": {"name": "mysqlwriter","parameter": {"writeMode": "insert","username": "root","password": "123456","column": ["id","name","age","gender","clazz","last_mod"],"preSql": ["truncate student2"], "connection": [{"jdbcUrl": "jdbc:mysql://master:3306/student2?useUnicode=true&characterEncoding=utf8","table": ["student2"]}]}}}],"setting": {"speed": {"channel": 6}}}
}
datax.py mysql2mysql.json
Writing to Hive is the same as writing to HDFS: hdfswriter simply writes files into the Hive table's HDFS directory.
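The job below writes comma-delimited text into /user/hive/warehouse/datax.db/students, so a matching Hive table should exist first. A minimal sketch, with types inferred from the writer's column list:
-- comma-delimited text table matching the hdfswriter path and fieldDelimiter below
create database if not exists datax;
create table if not exists datax.students(
    id BIGINT,
    name STRING,
    age INT,
    gender STRING,
    clazz STRING,
    last_mod STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';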
{"job": {"content": [{"reader": {"name": "mysqlreader","parameter": {"username": "root","password": "123456","column": ["id","name","age","gender","clazz","last_mod"],"splitPk": "age","connection": [{"table": ["student"],"jdbcUrl": ["jdbc:mysql://master:3306/student"]}]}},"writer": {"name": "hdfswriter","parameter": {"defaultFS": "hdfs://master:9000","fileType": "text","path": "/user/hive/warehouse/datax.db/students","fileName": "student","column": [{"name": "id","type": "bigint"},{"name": "name","type": "string"},{"name": "age","type": "INT"},{"name": "gender","type": "string"},{"name": "clazz","type": "string"},{"name": "last_mod","type": "string"}],"writeMode": "append","fieldDelimiter": ","}}}],"setting": {"speed": {"channel": 6}}}
}
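The original notes do not name this job file; assuming it is saved as mysql2hdfs.json, run it the same way:
datax.py mysql2hdfs.json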
{"job": {"content": [{"reader": {"name": "hbase11xreader","parameter": {"hbaseConfig": {"hbase.zookeeper.quorum": "master:2181"},"table": "student","encoding": "utf-8","mode": "normal","column": [{"name": "rowkey","type": "string"},{"name": "cf1:name","type": "string"},{"name": "cf1:age","type": "string"},{"name": "cf1:gender","type": "string"},{"name": "cf1:clazz","type": "string"}],"range": {"startRowkey": "","endRowkey": "","isBinaryRowkey": false}}},"writer": {"name": "mysqlwriter","parameter": {"writeMode": "insert","username": "root","password": "123456","column": ["id","name","age","gender","clazz"],"preSql": ["truncate student2"], "connection": [{"jdbcUrl": "jdbc:mysql://master:3306/student2?useUnicode=true&characterEncoding=utf8","table": ["student2"]}]}}}],"setting": {"speed": {"channel": 6}}}
}
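Assuming the job above is saved as hbase2mysql.json (a name chosen here, not given in the notes):
datax.py hbase2mysql.json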
In MySQL, the score table's cource_id column must be renamed to course_id, (student_id, course_id) must become the primary key, and all columns must be changed to INT, for example:
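-- a sketch of those schema changes (assumes score has no primary key yet)
ALTER TABLE score CHANGE cource_id course_id INT;
ALTER TABLE score MODIFY student_id INT, MODIFY score INT;
ALTER TABLE score ADD PRIMARY KEY (student_id, course_id);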
In HBase, create the score table first, in the hbase shell:
create 'score','cf1'
In the job below, rowkeyColumn assembles the HBase rowkey as student_id + "_" + course_id, and the score value is stored in cf1:score.
{"job": {"content": [{"reader": {"name": "mysqlreader","parameter": {"username": "root","password": "123456","column": ["student_id","course_id","score"],"splitPk": "course_id","connection": [{"table": ["score"],"jdbcUrl": ["jdbc:mysql://master:3306/student"]}]}},"writer": {"name": "hbase11xwriter","parameter": {"hbaseConfig": {"hbase.zookeeper.quorum": "master:2181"},"table": "score","mode": "normal","rowkeyColumn": [{"index":0,"type":"string"},{"index":-1,"type":"string","value":"_"},{"index":1,"type":"string"}],"column": [{"index":2,"name": "cf1:score","type": "int"}],"encoding": "utf-8"}}}],"setting": {"speed": {"channel": 6}}}
}
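Assuming the job above is saved as mysql2hbase.json (again a name chosen here):
datax.py mysql2hbase.json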
# Database — environment variables referenced by the datax-admin application.yml below
DB_HOST=master
DB_PORT=3306
DB_USERNAME=root
DB_PASSWORD=123456
DB_DATABASE=dataxweb
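The database named by DB_DATABASE must exist before datax-admin starts. A minimal sketch (the charset is an assumption; datax-web's own table-init script, datax_web.sql in its source tree, still has to be imported into this database):
CREATE DATABASE IF NOT EXISTS dataxweb DEFAULT CHARACTER SET utf8mb4;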
# application.yml of the datax-admin module
server:
  #port: 8080
  port: 8080

spring:
  # data source
  datasource:
    #username: root
    #password: root
    #url: jdbc:mysql://localhost:3306/datax_web?serverTimezone=Asia/Shanghai&useLegacyDatetimeCode=false&useSSL=false&nullNamePatternMatchesAll=true&useUnicode=true&characterEncoding=UTF-8
    password: ${DB_PASSWORD:password}
    username: ${DB_USERNAME:username}
    url: jdbc:mysql://${DB_HOST:127.0.0.1}:${DB_PORT:3306}/${DB_DATABASE:dataxweb}?serverTimezone=Asia/Shanghai&useLegacyDatetimeCode=false&useSSL=false&nullNamePatternMatchesAll=true&useUnicode=true&characterEncoding=UTF-8
    driver-class-name: com.mysql.jdbc.Driver

    hikari:
      ## minimum number of idle connections
      minimum-idle: 5
      ## max idle time of a connection, default 600000 (10 minutes)
      idle-timeout: 180000
      ## max pool size, default 10
      maximum-pool-size: 10
      ## connection timeout, default 30 seconds (30000)
      connection-timeout: 30000
      connection-test-query: SELECT 1
      ## max lifetime of a pooled connection; 0 means unlimited, default 1800000 (30 minutes)
      max-lifetime: 1800000

  # datax-web email
  mail:
    host: smtp.qq.com
    port: 25
    #username: xxx@qq.com
    #password: xxx
    username: root
    password: root
    properties:
      mail:
        smtp:
          auth: true
          starttls:
            enable: true
            required: true
        socketFactory:
          class: javax.net.ssl.SSLSocketFactory

management:
  health:
    mail:
      enabled: false
  server:
    servlet:
      context-path: /actuator

mybatis-plus:
  # mapper.xml scan path
  mapper-locations: classpath*:/mybatis-mapper/*Mapper.xml
  # entity scan; separate multiple packages with a comma or semicolon
  #typeAliasesPackage: com.yibo.essyncclient.*.entity
  global-config:
    # database-related settings
    db-config:
      # primary key type: AUTO (database auto-increment), INPUT (user supplied), ID_WORKER (globally unique numeric id), UUID (globally unique UUID)
      id-type: AUTO
      # field strategy: IGNORED (no check), NOT_NULL (non-null check), NOT_EMPTY (non-empty check)
      field-strategy: NOT_NULL
      # camel-case / underscore conversion
      column-underline: true
      # logical delete
      logic-delete-value: 0
      logic-not-delete-value: 1
      # database type
      db-type: mysql
    banner: false
  # native mybatis settings
  configuration:
    map-underscore-to-camel-case: true
    cache-enabled: false
    call-setters-on-nulls: true
    jdbc-type-for-null: 'null'
  type-handlers-package: com.wugui.datax.admin.core.handler

# mybatis-plus SQL logging
logging:
  #level:
  #  com.wugui.datax.admin.mapper: info
  #path: ./data/applogs/admin
  level:
    com.wugui.datax.admin.mapper: error
  path: ./applogs/admin

# datax-job, access token
datax:
  job:
    accessToken:
    # i18n (default empty as chinese, "en" as english)
    i18n:
    ## triggerpool max size
    triggerpool:
      fast:
        max: 200
      slow:
        max: 100
    ### log retention days
    logretentiondays: 30
  datasource:
    aes:
      key: AD42F6697B035B75
# application.yml of the datax-executor module
# web port
server:
  port: 8081
  #port: 8081

# log config
logging:
  config: classpath:logback.xml
  path: ./applogs/executor/jobhandler
  #path: ./data/applogs/executor/jobhandler

datax:
  job:
    admin:
      ### datax admin address list, such as "http://address" or "http://address01,http://address02"
      #addresses: http://127.0.0.1:8080
      addresses: http://127.0.0.1:8080
    executor:
      appname: datax-executor
      ip:
      #port: 9999
      port: 9999
      ### job log path
      #logpath: ./data/applogs/executor/jobhandler
      logpath: ./applogs/executor/jobhandler
      ### job log retention days
      logretentiondays: 30
    ### job, access token
    accessToken:
  executor:
    #jsonpath: D:\\temp\\executor\\json\\
    jsonpath: ./json
    #pypath: F:\tools\datax\bin\datax.py
    pypath: C:\Users\zzk10\Documents\MacOS\DataIntegrate\datax\datax\bin\datax.py
Note on pypath: it must point at bin/datax.py inside your own extraction of datax.tar.gz; do not use a path that contains Chinese characters.
After extracting, replace the scripts under datax/bin/ with the Python 3 versions shipped with datax-web (the stock DataX scripts are Python 2).
As long as the processes have not stopped on their own, they just keep running in the background; nothing else needs to be done.
hive
The hiveserver2 service must be started first. The following command has to be run in a Linux shell:
hive --service hiveserver2
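Run in the foreground, hiveserver2 occupies the shell; a common way to keep it alive in the background (the log file name is arbitrary):
nohup hive --service hiveserver2 > hiveserver2.log 2>&1 &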
After changing the pom file, remember to reimport the Maven project (e.g. reload it in the IDE); otherwise the change will not take effect.