ClickHouse

A column-oriented database: even on billion-row datasets, a query over the last 7 days of data can return in under 1 second.

Typical applications: logs, data analytics, and event tracking.

Official site: https://clickhouse.com/

Data types: https://clickhouse.com/docs/en/sql-reference/data-types

Directory layout

.
├── config
│   ├── config.xml
│   ├── server.crt
│   ├── server.key
│   └── users.xml
└── setup.sh

Startup

Single node

#!/bin/bash
echo "Asia/Shanghai" > /etc/timezone

docker stop clickhouse
docker rm clickhouse
docker run -d --name clickhouse \
--restart=always \
--network host \
-m 4g \
--add-host localhost:10.0.18.2 \
-v /etc/localtime:/etc/localtime:ro \
-v /etc/timezone:/etc/timezone:ro \
-e TZ='Asia/Shanghai' \
--ulimit nofile=262144:262144 \
-v $(pwd)/data:/var/lib/clickhouse \
-v $(pwd)/config:/etc/clickhouse-server \
-v $(pwd)/initdb:/docker-entrypoint-initdb.d \
-v $(pwd)/log:/var/log/clickhouse-server \
clickhouse/clickhouse-server:23.12-alpine

# --volume=$(pwd)/scripts/init-db.sh:/docker-entrypoint-initdb.d/init-db.sh \
# docker logs -f clickhouse

# the TLS certificate must be generated yourself:
# openssl req -subj "/CN=localhost" -new -newkey rsa:2048 -days 365 -nodes -x509 -keyout $(pwd)/config/server.key -out $(pwd)/config/server.crt

Configuration files

users.xml
config.xml
<clickhouse>
    <!-- See also the files in users.d directory where the settings can be overridden. -->

    
    <!-- Profiles of settings. -->
    <profiles>
        <!-- Default settings. -->
        <default>
            <max_memory_usage>2000000000</max_memory_usage>
        </default>

        <!-- Profile that allows only read queries. -->
        <readonly>
            <max_memory_usage>2000000000</max_memory_usage>
            <readonly>1</readonly>
        </readonly>
    </profiles>

    <!-- Users and ACL. -->
    <users>
        <!-- If user name was not specified, 'default' user is used. -->
        <default>
            <!-- See also the files in users.d directory where the password can be overridden.

                 Password could be specified in plaintext or in SHA256 (in hex format).

                 If you want to specify password in plaintext (not recommended), place it in 'password' element.
                 Example: <password>qwerty</password>.
                 Password could be empty.

                 If you want to specify SHA256, place it in 'password_sha256_hex' element.
                 Example: <password_sha256_hex>65e84be33532fb784c48129675f9eff3a682b27168c0ea744b2cf58ee02337c5</password_sha256_hex>
                 Restrictions of SHA256: impossibility to connect to ClickHouse using MySQL JS client (as of July 2019).

                 If you want to specify double SHA1, place it in 'password_double_sha1_hex' element.
                 Example: <password_double_sha1_hex>e395796d6546b1b65db9d665cd43f0e858dd4303</password_double_sha1_hex>

                 If you want to specify a previously defined LDAP server (see 'ldap_servers' in the main config) for authentication,
                  place its name in 'server' element inside 'ldap' element.
                 Example: <ldap><server>my_ldap_server</server></ldap>

                 If you want to authenticate the user via Kerberos (assuming Kerberos is enabled, see 'kerberos' in the main config),
                  place 'kerberos' element instead of 'password' (and similar) elements.
                 The name part of the canonical principal name of the initiator must match the user name for authentication to succeed.
                 You can also place 'realm' element inside 'kerberos' element to further restrict authentication to only those requests
                  whose initiator's realm matches it.
                 Example: <kerberos />
                 Example: <kerberos><realm>EXAMPLE.COM</realm></kerberos>

                 How to generate decent password:
                 Execute: PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha256sum | tr -d '-'
                 In first line will be password and in second - corresponding SHA256.

                 How to generate double SHA1:
                 Execute: PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha1sum | tr -d '-' | xxd -r -p | sha1sum | tr -d '-'
                 In first line will be password and in second - corresponding double SHA1.
            -->
            <password></password>

            <!-- List of networks with open access.

                 To open access from everywhere, specify:
                    <ip>::/0</ip>

                 To open access only from localhost, specify:
                    <ip>::1</ip>
                    <ip>127.0.0.1</ip>

                 Each element of list has one of the following forms:
                 <ip> IP-address or network mask. Examples: 213.180.204.3 or 10.0.0.1/8 or 10.0.0.1/255.255.255.0
                     2a02:6b8::3 or 2a02:6b8::3/64 or 2a02:6b8::3/ffff:ffff:ffff:ffff::.
                 <host> Hostname. Example: server01.clickhouse.com.
                     To check access, DNS query is performed, and all received addresses compared to peer address.
                 <host_regexp> Regular expression for host names. Example, ^server\d\d-\d\d-\d\.clickhouse\.com$
                     To check access, DNS PTR query is performed for peer address and then regexp is applied.
                     Then, for result of PTR query, another DNS query is performed and all received addresses compared to peer address.
                     Strongly recommended that regexp is ends with $
                 All results of DNS requests are cached till server restart.
            -->
            <networks>
                <ip>::/0</ip>
            </networks>

            <!-- Settings profile for user. -->
            <profile>default</profile>

            <!-- Quota for user. -->
            <quota>default</quota>

            <!-- User can create other users and grant rights to them. -->
            <access_management>1</access_management>
        </default>
        <root>
            <password></password>
            <access_management>1</access_management>
        </root>
    </users>

    <!-- Quotas. -->
    <quotas>
        <!-- Name of quota. -->
        <default>
            <!-- Limits for time interval. You could specify many intervals with different limits. -->
            <interval>
                <!-- Length of interval. -->
                <duration>3600</duration>

                <!-- No limits. Just calculate resource usage for time interval. -->
                <queries>0</queries>
                <errors>0</errors>
                <result_rows>0</result_rows>
                <read_rows>0</read_rows>
                <execution_time>0</execution_time>
            </interval>
        </default>
    </quotas>
</clickhouse>

Setting passwords

Changes here take effect without restarting the server.

  • Plaintext: put the password in the password element in users.xml
  • SHA256 hash:
# generate a random password and its SHA256 hash
PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha256sum | tr -d '-'
<!-- password: YNnDE59y -->
<password_sha256_hex>dc1d38d778472ca940041646f961ffe71c8ba5d7f9ee0ea3855bed87087f66e6</password_sha256_hex>
PASSWORD=password; echo "$PASSWORD"; echo -n "$PASSWORD" | sha256sum | tr -d '-'
password
5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8
  • Double SHA1 hash (required for MySQL-protocol clients):
# install the xxd command on CentOS 7
yum install vim-common -y

# generate a random password and its double SHA1 hash
PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha1sum | tr -d '-' | xxd -r -p | sha1sum | tr -d '-'
<!-- password: XvUqGXVa -->
<password_double_sha1_hex>5f7192216a2511c6555ee6e9072a9a693de0eeb8</password_double_sha1_hex>
PASSWORD=password; echo "$PASSWORD"; echo -n "$PASSWORD" | sha1sum | tr -d '-' | xxd -r -p | sha1sum | tr -d '-'
password
2470c0c06dee42fd1618bb99005adca2ec9d1e19
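
Because access_management is enabled for the default user above, accounts can also be managed with SQL instead of users.xml (users defined in users.xml themselves are read-only from SQL). A minimal sketch, with a hypothetical app_user reusing the SHA256 hash of "password" from the example above:

CREATE USER app_user IDENTIFIED WITH sha256_hash BY '5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8';
-- verify the stored definition
SHOW CREATE USER app_user;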

Creating tables

nginx
service
mysql_slow_log
pg_slow_log
SET allow_experimental_object_type=1;
CREATE TABLE log.ngxlog
(
    `create_time` DateTime('Asia/Shanghai'),
    `kafka_offset` UInt64,
    `query_params_imsi` Nullable(String),
    `query_params_imei` Nullable(String),
    `query_params_hsman` Nullable(String),
    `query_params_hstype` Nullable(String),
    `query_params_exttype` Nullable(String),
    `nginx_time_local` String,
    `nginx_upstream_addr` String,
    `nginx_uri` String,
    `nginx_status` UInt16,
    `nginx_host` String,
    `mcc` Nullable(String),
    `hostname` String,
    `message` Nullable(String),
    `nginx_query_string` Nullable(String),
    `nginx_upstream_response_time` Float32,
    `nginx_request_time` Float32
)
ENGINE = MergeTree
PARTITION BY toYYYYMMDD(create_time)
ORDER BY create_time
TTL create_time + toIntervalDay(180)
SETTINGS index_granularity = 1024
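
A quick sanity-check query (my own sketch, not from the original): because the table is partitioned by toYYYYMMDD(create_time) and ordered by create_time, a 7-day window only has to scan the last 7 daily partitions.

-- daily request count, 5xx count, and average latency for the past week
SELECT
    toDate(create_time) AS day,
    count() AS requests,
    countIf(nginx_status >= 500) AS errors_5xx,
    round(avg(nginx_request_time), 3) AS avg_request_time
FROM log.ngxlog
WHERE create_time >= now() - INTERVAL 7 DAY
GROUP BY day
ORDER BY day;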

Using the ClickHouse Kafka engine

Docs: https://clickhouse.com/docs/zh/engines/table-engines/integrations/kafka

Purpose

The pipeline has three parts: a Kafka engine table consumes messages from the topic, a materialized view transforms each row, and a MergeTree data table stores the result.

Engine table

The engine table defines the mapping between the Kafka JSON fields and the ClickHouse columns.

  • Use Nullable(String) for time fields.
  • Use Nullable(String) for fields that will later become Object('json').
  • To consume the same data again, change kafka_group_name.

CREATE TABLE test.kafka_stream
(
    `id` Nullable(Int64),
    `create_time` Nullable(Int64),
    `format_time` Nullable(String),
    `name` Nullable(String),
    `data` Nullable(String)
)
ENGINE = Kafka
SETTINGS kafka_broker_list = '192.168.1.1:9092',
 kafka_topic_list = 'test',
 kafka_group_name = 'esxi-vm-clickhouse-60451-test-test-stream',
 kafka_format = 'JSONEachRow',
 kafka_num_consumers = 1;
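
To check whether the consumer is actually polling, recent releases (23.8 and later, so the images used above qualify) expose the system.kafka_consumers table; a hedged sketch:

-- shows assigned partitions, current offsets, and recent exceptions per consumer
SELECT *
FROM system.kafka_consumers
WHERE database = 'test' AND `table` = 'kafka_stream'
FORMAT Vertical;
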
Data table

The data table is where the rows are actually stored.

  • The Object('json') column type requires set allow_experimental_object_type = 1.
  • Give DateTime columns an explicit time zone.

set allow_experimental_object_type =1;
CREATE TABLE test.action_log
(
    `kafka_create_time` DateTime('Asia/Shanghai') DEFAULT now(),
    `kafka_offset` UInt128,
    `id` Nullable(Int64),
    `create_time` DateTime('Asia/Shanghai'),
    `format_time` DateTime('Asia/Shanghai'),
    `name` Nullable(String),
    `data` Object('json')
)
ENGINE = MergeTree
PARTITION BY toYYYYMMDD(kafka_create_time)
ORDER BY kafka_create_time
SETTINGS index_granularity = 8192;
Materialized view

The materialized view reads from the Kafka engine table and writes the rows into the data table via the TO clause.

  • The parseDateTimeBestEffort function parses time values automatically.
  • An Object('json') column cannot be NULL, so the view has to substitute a default (here '{}').
  • Unix-timestamp fields can be inserted directly into DateTime columns via parseDateTimeBestEffort.
  • _timestamp and _offset are virtual columns of the Kafka engine; see the official docs.

CREATE MATERIALIZED VIEW test.kafka_stream_mv TO test.action_log AS
SELECT
    _timestamp AS kafka_create_time,
    _offset AS kafka_offset,
    id,
    parseDateTimeBestEffort(toString(create_time)) AS create_time,
    parseDateTimeBestEffort(format_time) AS format_time,
    name,
    if(data IS NULL, '{}', data) AS data
FROM test.kafka_stream;
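
Per the linked Kafka-engine docs, to stop consumption (for example to change the transformation logic), detach the materialized view and re-attach it afterwards:

DETACH TABLE test.kafka_stream_mv;
-- ... adjust the target table or recreate the view here ...
ATTACH TABLE test.kafka_stream_mv;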

Data migration

Export query results to CSV

docker run --rm --name clickhouse-backup -i \
    --network=host \
    clickhouse/single/clickhouse-server:23.2.3 \
    clickhouse-client -h 172.168.1.1 \
    --max_memory_usage=100000000000 --query="SELECT * from tmp.test FORMAT CSVWithNames" > test.csv
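
Alternatively, clickhouse-client can write the file itself with INTO OUTFILE, avoiding the shell redirection (a sketch against the same tmp.test table):

SELECT *
FROM tmp.test
INTO OUTFILE 'test.csv'   -- note: fails if test.csv already exists
FORMAT CSVWithNames;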

Import CSV

cat test_data.csv | docker run --rm --name clickhouse-import -i \
    --network=host \
    clickhouse/single/clickhouse-server:23.2.3 \
    clickhouse-client -h 172.168.1.1 \
    --user root --password password \
    --query="INSERT INTO tmp.test (id,phone) FORMAT CSVWithNames"

Run a SQL file

docker run -i --rm --net host \
-v $(pwd)/running.sql:/tmp/running.sql \
clickhouse/single/clickhouse-server:23.7.1-alpine \
clickhouse-client -h 127.0.0.1 --port 9000 \
--user default --password xxx \
--queries-file /tmp/running.sql > log/1.log

Cluster

Because the cluster needs quite a bit of configuration, it is split across separate files below.

.
├── config
│   ├── config.d
│   │   ├── cluster.xml
│   │   ├── default.xml
│   │   ├── disk.xml
│   │   └── zookeeper.xml
│   ├── config.xml
│   ├── users.d
│   │   ├── default-password.xml
│   │   └── root-password.xml
│   └── users.xml
└── setup.sh

Cluster with ZooKeeper

#!/bin/bash
echo "Asia/Shanghai" > /etc/timezone

docker stop clickhouse
docker rm clickhouse
docker run -d --name clickhouse \
--restart=always \
--network host \
-m 2g \
--add-host localhost:127.0.0.1 \
-v /etc/localtime:/etc/localtime:ro \
-v /etc/timezone:/etc/timezone:ro \
-e TZ='Asia/Shanghai' \
--ulimit nofile=262144:262144 \
-v $(pwd)/data:/var/lib/clickhouse \
-v $(pwd)/config:/etc/clickhouse-server \
-v $(pwd)/initdb:/docker-entrypoint-initdb.d \
-v $(pwd)/log:/var/log/clickhouse-server \
-v /data2:/data2 \
clickhouse/clickhouse-server:23.9-alpine

# --volume=$(pwd)/scripts/init-db.sh:/docker-entrypoint-initdb.d/init-db.sh \
# docker logs -f clickhouse
# keeper operations:
# docker exec -it clickhouse clickhouse-keeper-client

# the TLS certificate must be generated yourself:
# openssl req -subj "/CN=localhost" -new -newkey rsa:2048 -days 365 -nodes -x509 -keyout $(pwd)/config/server.key -out $(pwd)/config/server.crt

Configuration files

users.xml
config.xml
default.xml
disk.xml
zookeeper.xml
cluster.xml
default-password.xml
root-password.xml
<clickhouse>
    <!-- See also the files in users.d directory where the settings can be overridden. -->

    
    <!-- Profiles of settings. -->
    <profiles>
        <!-- Default settings. -->
        <default>
            <!-- for version 24+, to use the native JSON type: -->
            <!-- <allow_experimental_json_type>1</allow_experimental_json_type> -->
            <allow_experimental_analyzer>1</allow_experimental_analyzer>
            <allow_experimental_object_type>1</allow_experimental_object_type>
            <!-- memory limits -->
            <max_memory_usage>28000000000</max_memory_usage>
        </default>
        <!-- Profile that allows only read queries. -->
        <readonly>
            <max_memory_usage>28000000000</max_memory_usage>
            <readonly>1</readonly>
        </readonly>
    </profiles>

    <!-- Quotas. -->
    <quotas>
        <!-- Name of quota. -->
        <default>
            <!-- Limits for time interval. You could specify many intervals with different limits. -->
            <interval>
                <!-- Length of interval. -->
                <duration>3600</duration>

                <!-- No limits. Just calculate resource usage for time interval. -->
                <queries>0</queries>
                <errors>0</errors>
                <result_rows>0</result_rows>
                <read_rows>0</read_rows>
                <execution_time>0</execution_time>
            </interval>
        </default>
    </quotas>
</clickhouse>

Cluster operations

Cluster tables: add the ON CLUSTER clause to DDL statements:

ON CLUSTER cluster_3shard
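
The usual pattern is a local table on every shard plus a Distributed table that fans queries out over them. A sketch with a hypothetical bigdata.example table, assuming cluster_3shard is defined in cluster.xml:

-- local storage table, created on every node of the cluster
CREATE TABLE bigdata.example_local ON CLUSTER cluster_3shard
(
    `create_time` DateTime('Asia/Shanghai'),
    `message` String
)
ENGINE = MergeTree
PARTITION BY toYYYYMMDD(create_time)
ORDER BY create_time;

-- query/insert entry point that routes to the local tables
CREATE TABLE bigdata.example ON CLUSTER cluster_3shard
AS bigdata.example_local
ENGINE = Distributed(cluster_3shard, bigdata, example_local, rand());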

Drop a table:

DROP TABLE bigdata.table ON CLUSTER cluster_3shard SYNC;

Delete data (executed as an asynchronous mutation):

ALTER TABLE action.local_user_action ON CLUSTER cluster_3shard DELETE WHERE create_time > toDateTime('2024-11-11 13:00:00');

User management

Create a read-only account for the bigdata database

CREATE ROLE bigdata_role ON CLUSTER cluster_4shard;

GRANT ON CLUSTER cluster_4shard SELECT ON bigdata.* TO bigdata_role;

CREATE USER bigdata_user ON CLUSTER cluster_4shard IDENTIFIED BY '${password}';
GRANT ON CLUSTER cluster_4shard bigdata_role TO bigdata_user;
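
To verify, on any node:

SHOW GRANTS FOR bigdata_user;
SHOW GRANTS FOR bigdata_role;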

Tips

Remote migration

-- migrate data from Object('json') to the JSON type via remote()
insert into action.tmp (create_time,kafka_offset,kafka_create_time,kafka_partition,kafka_topic,message)
select
    create_time,
    kafka_offset,
    kafka_create_time,
    kafka_partition,
    kafka_topic,
    toJSONString(message)
from remote('192.168.1.1:9000',action.tmp,'root','xxx');
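
A quick check that the copy is complete (my own sketch, same connection parameters as above):

SELECT
    (SELECT count() FROM remote('192.168.1.1:9000', action.tmp, 'root', 'xxx')) AS src_rows,
    (SELECT count() FROM action.tmp) AS dst_rows;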