
Detailed Steps to Download, Install, and Use DataX

编程与架构 2023-05-30

  • Introduction

  • Download

  • Sync plugin components

  • Configuring data synchronization

    • Viewing the official reader/writer configuration template

    • Creating the HBase and Phoenix tables

    • Creating the ClickHouse table

    • Inserting ClickHouse test data

    • Writing the ClickHouse2Hbase job file

    • Running the sync job

  • Going further

    • ClickHouse-to-MySQL job file


01. Importing Data from ClickHouse into HBase (Phoenix) with DataX

Introduction

DataX is an offline synchronization tool for heterogeneous data sources. It provides stable and efficient data synchronization between a wide range of sources, including relational databases (MySQL, Oracle, etc.), HDFS, Hive, ODPS, HBase, and FTP.

Download

# Official release
wget http://datax-opensource.oss-cn-hangzhou.aliyuncs.com/datax.tar.gz

# Prebuilt package with the ClickHouse and HBase 2.x plugins
Link: https://pan.baidu.com/s/1IYU93oGOnvcx34HJaPDudQ
Extraction code: bool

Extract the archive:

tar -zxvf datax.tar.gz

The extracted directory layout is as follows:

[root@dev-bigdata-24-146 datax]# ls
bin conf job lib log log_perf plugin script tmp
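Before writing a real job, it is worth verifying the installation with the stream-to-stream self-check job that ships in the job directory (a minimal check; the job/job.json path comes from the official quick-start guide):

# Run the bundled self-check job from the DataX home directory
python bin/datax.py job/job.json
# A healthy install ends with a job summary reporting zero read/write failures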

Sync plugin components

The reader directory contains the supported source (reader) plugins.

The writer directory contains the supported target (writer) plugins.

[root@dev-bigdata-24-146 datax]# tree -L 2 plugin/
plugin/
├── reader
│   ├── cassandrareader
│   ├── clickhousereader
│   ├── dbffilereader
│   ├── drdsreader
│   ├── elasticsearchreader
│   ├── ftpreader
│   ├── gaussdbreader
│   ├── gbasereader
│   ├── gdbreader
│   ├── hbase094xreader
│   ├── hbase11xkerberosreader
│   ├── hbase11xreader
│   ├── hbase11xsqlreader
│   ├── hbase20xsqlreader
│   ├── hdfsreader
│   ├── httpreader
│   ├── influxdbreader
│   ├── jsonfilereader
│   ├── kingbaseesreader
│   ├── kudureader
│   ├── mongodbreader
│   ├── mysql8reader
│   ├── mysqlreader
│   ├── odpsreader
│   ├── opentsdbreader
│   ├── oraclereader
│   ├── oscarreader
│   ├── ossreader
│   ├── otsreader
│   ├── otsstreamreader
│   ├── postgresqlreader
│   ├── rdbmsreader
│   ├── redisreader
│   ├── sqlserverreader
│   ├── streamreader
│   ├── tdenginereader
│   └── txtfilereader
└── writer
    ├── adbpgwriter
    ├── adswriter
    ├── cassandrawriter
    ├── clickhousewriter
    ├── dbffilewriter
    ├── dorisdbwriter
    ├── doriswriter
    ├── drdswriter
    ├── elasticsearchwriter
    ├── ftpwriter
    ├── gaussdbwriter
    ├── gbasewriter
    ├── gdbwriter
    ├── greenplumwriter
    ├── hbase094xwriter
    ├── hbase11xkerberoswriter
    ├── hbase11xsqlwriter
    ├── hbase11xwriter
    ├── hbase20xsqlwriter
    ├── hdfswriter
    ├── influxdbwriter
    ├── kingbaseeswriter
    ├── kuduwriter
    ├── mongodbwriter
    ├── mysql8writer
    ├── mysqlwriter
    ├── ocswriter
    ├── odpswriter
    ├── oraclewriter
    ├── oscarwriter
    ├── osswriter
    ├── otswriter
    ├── postgresqlwriter
    ├── rdbmswriter
    ├── rediswriter
    ├── sqlserverwriter
    ├── streamwriter
    ├── tdenginewriter
    ├── tsdbwriter
    └── txtfilewriter

Configuring data synchronization

This walkthrough uses the HBase 2.1.0 release bundled with CDH 6.3.2, on top of which Phoenix must be installed.

For Phoenix installation steps, see: https://datamining.blog.csdn.net/article/details/105572156

Viewing the official reader/writer configuration template

Command: python bin/datax.py -r clickhousereader -w hbase20xsqlwriter

[root@jast datax]# python bin/datax.py -r clickhousereader -w hbase20xsqlwriter

DataX (DATAX-OPENSOURCE-3.0), From Alibaba !
Copyright (C) 2010-2017, Alibaba Group. All Rights Reserved.


Please refer to the clickhousereader document:
https://github.com/alibaba/DataX/blob/master/clickhousereader/doc/clickhousereader.md

Please refer to the hbase20xsqlwriter document:
https://github.com/alibaba/DataX/blob/master/hbase20xsqlwriter/doc/hbase20xsqlwriter.md

Please save the following configuration as a json file and use
python {DATAX_HOME}/bin/datax.py {JSON_FILE_NAME}.json
to run the job.

{
    "job": {
        "content": [
            {
                "reader": {
                    "name": "clickhousereader",
                    "parameter": {
                        "column": [],
                        "connection": [
                            {
                                "jdbcUrl": [],
                                "table": []
                            }
                        ],
                        "password": "",
                        "username": "",
                        "where": ""
                    }
                },
                "writer": {
                    "name": "hbase20xsqlwriter",
                    "parameter": {
                        "batchSize": "100",
                        "column": [],
                        "nullMode": "skip",
                        "queryServerAddress": "",
                        "schema": "",
                        "serialization": "PROTOBUF",
                        "table": ""
                    }
                }
            }
        ],
        "setting": {
            "speed": {
                "channel": ""
            }
        }
    }
}


Creating the HBase and Phoenix tables

DataX writes to HBase 2.x through Phoenix, so a matching Phoenix table has to be created alongside the HBase table.

HBase table creation:

create 'test_datax',{NAME=>'cf'}

Phoenix table creation:

create table "test_datax"("rowkey" varchar primary key, "cf"."data_id" varchar , "cf"."user_id" varchar, "cf"."name" varchar, "cf"."phone" varchar, "cf"."pt" varchar) column_encoded_bytes=0;

Creating the ClickHouse table

CREATE TABLE test_datax_ck
(
    data_id String,
    user_id Nullable(String),
    name    Nullable(String),
    phone   Nullable(String),
    pt      String
)
ENGINE = MergeTree
PARTITION BY pt
ORDER BY (pt)
SETTINGS index_granularity = 8192;

Inserting ClickHouse test data

insert into test_datax_ck("data_id","user_id","name","phone","pt") values ('1','1','张三1','13577665544','20210101');
insert into test_datax_ck("data_id","user_id","name","phone","pt") values ('2','2','张三2','13577665546','20210101');
insert into test_datax_ck("data_id","user_id","name","phone","pt") values ('3','3','张三3','13577665545','20210101');
insert into test_datax_ck("data_id","user_id","name","phone","pt") values ('4','4','张三4','13577665543','20210102');
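A quick sanity check that the rows landed, assuming the table lives in the ms_db database referenced by the job's JDBC URL (adjust host and credentials for your environment):

# Query the test rows with the native client
clickhouse-client --host 192.16.24.150 --database ms_db \
    --query "SELECT * FROM test_datax_ck ORDER BY data_id"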

Writing the ClickHouse2Hbase job file

Create the job file with vim ck2hbase.json; its contents are shown below. DataX maps reader columns to writer columns by position, which is why data_id appears twice in the reader list: the first occurrence feeds the writer's rowkey column. The writer's queryServerAddress points at the Phoenix Query Server (default port 8765).

{
    "job": {
        "content": [
            {
                "reader": {
                    "name": "clickhousereader",
                    "parameter": {
                        "column": [
                            "data_id",
                            "data_id",
                            "user_id",
                            "name",
                            "phone",
                            "pt"
                        ],
                        "connection": [
                            {
                                "jdbcUrl": [
                                    "jdbc:clickhouse://192.16.24.150:8123/ms_db?socket_timeout=7200000"
                                ],
                                "table": [
                                    "test_datax_ck"
                                ]
                            }
                        ],
                        "password": "root",
"username":"default",
"where":""
}
},
"writer":{
"name":"hbase20xsqlwriter",
"parameter":{
"batchSize":"100",
"column":[
"rowkey",
"data_id",
"user_id",
"name",
"phone",
"pt"
],
"nullMode":"skip",
"queryServerAddress":"http://192.1.24.146:8765",
"serialization":"PROTOBUF",
"table":"test_datax"
}
}
}
],
"setting":{
"speed":{
"channel":1
}
}
}
}
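If the job needs more memory than the defaults, the launcher accepts a --jvm option (per the official DataX user guide) for passing JVM flags; the heap sizes below are illustrative:

# Run the job with a larger, fixed-size heap
python bin/datax.py --jvm="-Xms1g -Xmx1g" ck2hbase.json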

Running the sync job

python bin/datax.py ck2hbase.json

2022-04-29 15:31:51.894 [job-0] INFO JobContainer -
Job start time          : 2022-04-29 15:31:48
Job end time            : 2022-04-29 15:31:51
Total elapsed time      : 2s
Average traffic         : 176B/s
Record write speed      : 12rec/s
Total records read      : 12
Read/write failures     : 0
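To verify the result on the HBase side, scan the target table from the HBase shell (a sketch; you can equally query "test_datax" through Phoenix as shown earlier):

# Non-interactive scan of the target table
echo "scan 'test_datax'" | hbase shell -n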

Going further

ClickHouse-to-MySQL job file

{
    "job": {
        "content": [
            {
                "reader": {
                    "name": "clickhousereader",
                    "parameter": {
                        "column": [
                            "one_id",
                            "aid",
                            "name",
                            "phone"
                        ],
                        "connection": [
                            {
                                "jdbcUrl": ["jdbc:clickhouse://192.1.24.171:8123/default?socket_timeout=7200000"],
                                "table": ["ads_user_test"]
                            }
                        ],
                        "password": "",
                        "username": "",
                        "where": ""
                    }
                },
                "writer": {
                    "name": "mysqlwriter",
                    "parameter": {
                        "print": true,
                        "column": [
                            "one_id",
                            "aid",
                            "name",
                            "phone"
                        ],
                        "connection": [
                            {
                                "jdbcUrl": ["jdbc:mysql://192.1.24.143:3306/test"],
                                "table": ["test20220429"]
                            }
                        ],
                        "password": "root",
                        "preSql": [],
                        "session": [],
                        "username": "root",
                        "writeMode": ""
                    }
                }
            }
        ],
        "setting": {
            "speed": {
                "channel": ""
            }
        }
    }
}
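mysqlwriter expects the target table to exist before the job runs, and writeMode should normally be set (for example "insert"). A minimal sketch of creating the target table; the column types are assumptions, since the original does not show the MySQL schema:

# Create the assumed target table before running the job (types are guesses)
mysql -h 192.1.24.143 -uroot -proot -e "
CREATE TABLE IF NOT EXISTS test.test20220429 (
    one_id VARCHAR(64),
    aid    VARCHAR(64),
    name   VARCHAR(64),
    phone  VARCHAR(32)
);"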


