我一直在尝试以 docker swarm 模式部署 Zookeeper 集群。
我已经部署了 3 台机器并将它们加入同一个 docker swarm 网络。我的需求是:在这 3 个节点上各运行 1 个 Zookeeper 实例(共 3 个实例),从而组成一个 Zookeeper 集群(ensemble)。
浏览过这个帖子之后,我对如何在 docker swarm 中部署 Zookeeper 有了一些了解。
正如@junius 所建议的,我已经创建了 docker compose 文件。
由于 docker swarm 会忽略部署约束(constraints),我已经把约束删除了。参考 https://forums.docker.com/t/docker-swarm-constraints-being-ignored/31555
我的 Zookeeper docker compose 文件如下所示
# Stack file for a three-member ZooKeeper ensemble: one swarm service per
# member (zoo1, zoo2, zoo3), all attached to the stack network `net`.
#
# NOTE(review): /data is a host bind mount, so a myid file written by an
# earlier task on the same node is still there when a different service is
# scheduled onto that node after a redeploy.
version: '3.3'

services:
  zoo1:
    image: zookeeper:3.4.12
    hostname: zoo1
    ports:
      # mode: host publishes each port directly on the node running the task.
      - target: 2181
        published: 2181
        protocol: tcp
        mode: host
      - target: 2888
        published: 2888
        protocol: tcp
        mode: host
      - target: 3888
        published: 3888
        protocol: tcp
        mode: host
    networks:
      - net
    deploy:
      restart_policy:
        condition: on-failure
    environment:
      # Quoted so the value reaches the container as a string.
      ZOO_MY_ID: '1'
      # Folded scalar: the three entries are joined with single spaces.
      # The member's own entry uses 0.0.0.0; peers are addressed by hostname.
      ZOO_SERVERS: >-
        server.1=0.0.0.0:2888:3888
        server.2=zoo2:2888:3888
        server.3=zoo3:2888:3888
    volumes:
      - /home/zk/data:/data
      - /home/zk/datalog:/datalog
      - /etc/localtime:/etc/localtime:ro

  zoo2:
    image: zookeeper:3.4.12
    hostname: zoo2
    ports:
      - target: 2181
        published: 2181
        protocol: tcp
        mode: host
      - target: 2888
        published: 2888
        protocol: tcp
        mode: host
      - target: 3888
        published: 3888
        protocol: tcp
        mode: host
    networks:
      - net
    deploy:
      restart_policy:
        condition: on-failure
    environment:
      ZOO_MY_ID: '2'
      ZOO_SERVERS: >-
        server.1=zoo1:2888:3888
        server.2=0.0.0.0:2888:3888
        server.3=zoo3:2888:3888
    volumes:
      - /home/zk/data:/data
      - /home/zk/datalog:/datalog
      - /etc/localtime:/etc/localtime:ro

  zoo3:
    image: zookeeper:3.4.12
    hostname: zoo3
    ports:
      - target: 2181
        published: 2181
        protocol: tcp
        mode: host
      - target: 2888
        published: 2888
        protocol: tcp
        mode: host
      - target: 3888
        published: 3888
        protocol: tcp
        mode: host
    networks:
      - net
    deploy:
      restart_policy:
        condition: on-failure
    environment:
      ZOO_MY_ID: '3'
      ZOO_SERVERS: >-
        server.1=zoo1:2888:3888
        server.2=zoo2:2888:3888
        server.3=0.0.0.0:2888:3888
    volumes:
      - /home/zk/data:/data
      - /home/zk/datalog:/datalog
      - /etc/localtime:/etc/localtime:ro

networks:
  # Empty mapping: the network is created with default settings.
  net: {}
使用 docker stack 命令部署。
docker stack deploy -c zoo3.yml zk
Creating network zk_net
Creating service zk_zoo3
Creating service zk_zoo1
Creating service zk_zoo2
Zookeeper 服务运行良好,每个节点中的每个服务都没有任何问题。
docker stack services zk
ID NAME MODE REPLICAS IMAGE PORTS
rn7t5f3tu0r4 zk_zoo1 replicated 1/1 zookeeper:3.4.12 0.0.0.0:2181->2181/tcp, 0.0.0.0:2888->2888/tcp, 0.0.0.0:3888->3888/tcp
u51r7bjwwm03 zk_zoo2 replicated 1/1 zookeeper:3.4.12 0.0.0.0:2181->2181/tcp, 0.0.0.0:2888->2888/tcp, 0.0.0.0:3888->3888/tcp
zlbcocid57xz zk_zoo3 replicated 1/1 zookeeper:3.4.12 0.0.0.0:2181->2181/tcp, 0.0.0.0:2888->2888/tcp, 0.0.0.0:3888->3888/tcp
当我停止并再次启动 zookeeper 堆栈时,我复现了这里讨论的那个问题。
docker stack rm zk
docker stack deploy -c zoo3.yml zk
这次 Zookeeper 集群没有形成。docker 容器实例输出了以下日志:
ZooKeeper JMX enabled by default
Using config: /conf/zoo.cfg
2018-11-02 15:24:41,531 [myid:2] - WARN [WorkerSender[myid=2]:QuorumCnxManager@584] - Cannot open channel to 1 at election address zoo1/10.0.0.4:3888
java.net.ConnectException: Connection refused (Connection refused)
at java.net.PlainSocketImpl.socketConnect(Native Method)
at java.net.AbstractPlainSocketImpl.doConnect(AbstractPlainSocketImpl.java:350)
at java.net.AbstractPlainSocketImpl.connectToAddress(AbstractPlainSocketImpl.java:206)
at java.net.AbstractPlainSocketImpl.connect(AbstractPlainSocketImpl.java:188)
at java.net.SocksSocketImpl.connect(SocksSocketImpl.java:392)
at java.net.Socket.connect(Socket.java:589)
at org.apache.zookeeper.server.quorum.QuorumCnxManager.connectOne(QuorumCnxManager.java:558)
at org.apache.zookeeper.server.quorum.QuorumCnxManager.toSend(QuorumCnxManager.java:534)
at org.apache.zookeeper.server.quorum.FastLeaderElection$Messenger$WorkerSender.process(FastLeaderElection.java:454)
at org.apache.zookeeper.server.quorum.FastLeaderElection$Messenger$WorkerSender.run(FastLeaderElection.java:435)
at java.lang.Thread.run(Thread.java:748)
2018-11-02 15:24:41,538 [myid:2] - WARN [WorkerSender[myid=2]:QuorumCnxManager@584] - Cannot open channel to 3 at election address zoo3/10.0.0.2:3888
java.net.ConnectException: Connection refused (Connection refused)
at java.net.PlainSocketImpl.socketConnect(Native Method)
at java.net.AbstractPlainSocketImpl.doConnect(AbstractPlainSocketImpl.java:350)
at java.net.AbstractPlainSocketImpl.connectToAddress(AbstractPlainSocketImpl.java:206)
at java.net.AbstractPlainSocketImpl.connect(AbstractPlainSocketImpl.java:188)
at java.net.SocksSocketImpl.connect(SocksSocketImpl.java:392)
at java.net.Socket.connect(Socket.java:589)
at org.apache.zookeeper.server.quorum.QuorumCnxManager.connectOne(QuorumCnxManager.java:558)
at org.apache.zookeeper.server.quorum.QuorumCnxManager.toSend(QuorumCnxManager.java:534)
at org.apache.zookeeper.server.quorum.FastLeaderElection$Messenger$WorkerSender.process(FastLeaderElection.java:454)
at org.apache.zookeeper.server.quorum.FastLeaderElection$Messenger$WorkerSender.run(FastLeaderElection.java:435)
at java.lang.Thread.run(Thread.java:748)
2018-11-02 15:38:19,146 [myid:2] - WARN [QuorumPeer[myid=2]/0.0.0.0:2181:Learner@237] - Unexpected exception, tries=1, connecting to /0.0.0.0:2888
java.net.ConnectException: Connection refused (Connection refused)
at java.net.PlainSocketImpl.socketConnect(Native Method)
at java.net.AbstractPlainSocketImpl.doConnect(AbstractPlainSocketImpl.java:350)
at java.net.AbstractPlainSocketImpl.connectToAddress(AbstractPlainSocketImpl.java:204)
at java.net.AbstractPlainSocketImpl.connect(AbstractPlainSocketImpl.java:188)
at java.net.SocksSocketImpl.connect(SocksSocketImpl.java:392)
at java.net.Socket.connect(Socket.java:589)
at org.apache.zookeeper.server.quorum.Learner.connectToLeader(Learner.java:229)
at org.apache.zookeeper.server.quorum.Follower.followLeader(Follower.java:72)
at org.apache.zookeeper.server.quorum.QuorumPeer.run(QuorumPeer.java:981)
2018-11-02 15:38:20,147 [myid:2] - WARN [QuorumPeer[myid=2]/0.0.0.0:2181:Learner@237] - Unexpected exception, tries=2, connecting to /0.0.0.0:2888
java.net.ConnectException: Connection refused (Connection refused)
at java.net.PlainSocketImpl.socketConnect(Native Method)
at java.net.AbstractPlainSocketImpl.doConnect(AbstractPlainSocketImpl.java:350)
at java.net.AbstractPlainSocketImpl.connectToAddress(AbstractPlainSocketImpl.java:204)
at java.net.AbstractPlainSocketImpl.connect(AbstractPlainSocketImpl.java:188)
at java.net.SocksSocketImpl.connect(SocksSocketImpl.java:392)
at java.net.Socket.connect(Socket.java:589)
at org.apache.zookeeper.server.quorum.Learner.connectToLeader(Learner.java:229)
at org.apache.zookeeper.server.quorum.Follower.followLeader(Follower.java:72)
at org.apache.zookeeper.server.quorum.QuorumPeer.run(QuorumPeer.java:981)
仔细观察发现,当我第一次部署此堆栈时,具有 id: 2 的 ZooKeeper 实例在节点 1 上运行。这创建了一个值为 2 的 myid 文件。
cat /home/zk/data/myid
2
当我停止并再次启动堆栈时,我发现这一次,ID:3 的 ZooKeeper 实例在节点 1 上运行。
docker ps
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
566b68c11c8b zookeeper:3.4.12 "/docker-entrypoin..." 6 minutes ago Up 6 minutes 0.0.0.0:2181->2181/tcp, 0.0.0.0:2888->2888/tcp, 0.0.0.0:3888->3888/tcp zk_zoo3.1.7m0hq684pkmyrm09zmictc5bm
但是 myid 文件的值仍然是 2,它是由之前的实例设置的。
因此日志显示 [myid:2] 并尝试连接到 id 为 1 和 3 的实例并失败。
进一步调试发现docker-entrypoint.sh文件包含如下代码
# Write myid only if it doesn't exist
# NOTE(review): if a myid file is already present in $ZOO_DATA_DIR, it is left
# untouched and the current ZOO_MY_ID value is not applied.
if [[ ! -f "$ZOO_DATA_DIR/myid" ]]; then
    echo "${ZOO_MY_ID:-1}" > "$ZOO_DATA_DIR/myid"
fi
这对我来说是个问题。我已经用以下内容编辑了 docker-entrypoint.sh,
# Always rewrite myid: remove any existing file first, then write the value of
# ZOO_MY_ID (falling back to 1 when it is unset or empty).
if [[ -f "$ZOO_DATA_DIR/myid" ]]; then
    rm "$ZOO_DATA_DIR/myid"
fi
echo "${ZOO_MY_ID:-1}" > "$ZOO_DATA_DIR/myid"
并在我的 compose 文件中把修改后的 docker-entrypoint.sh 挂载进容器。
通过此修复,我能够多次停止和启动我的堆栈,并且每次我的 zookeeper 实例都能组成集群(ensemble),而不会遇到连接问题。
我的docker-entrypoint.sh文件如下
#!/bin/bash
# Container entrypoint: optionally drops root privileges, generates zoo.cfg
# when it is missing, rewrites the myid file, then runs the given command.
set -e

# Allow the container to be started with `--user`.
# When asked to run zkServer.sh as root, give the data directories to
# $ZOO_USER and re-execute this script as that user.
if [[ "$1" = 'zkServer.sh' && "$(id -u)" = '0' ]]; then
    chown -R "$ZOO_USER" "$ZOO_DATA_DIR" "$ZOO_DATA_LOG_DIR"
    exec su-exec "$ZOO_USER" "$0" "$@"
fi

zoo_cfg="$ZOO_CONF_DIR/zoo.cfg"
myid_file="$ZOO_DATA_DIR/myid"

# Generate the config only if it doesn't exist.
if [[ ! -f "$zoo_cfg" ]]; then
    {
        echo "clientPort=$ZOO_PORT"
        echo "dataDir=$ZOO_DATA_DIR"
        echo "dataLogDir=$ZOO_DATA_LOG_DIR"
        echo "tickTime=$ZOO_TICK_TIME"
        echo "initLimit=$ZOO_INIT_LIMIT"
        echo "syncLimit=$ZOO_SYNC_LIMIT"
        echo "maxClientCnxns=$ZOO_MAX_CLIENT_CNXNS"
        # Deliberately unquoted: ZOO_SERVERS is a whitespace-separated list
        # and every entry becomes its own line in the config.
        for entry in $ZOO_SERVERS; do
            echo "$entry"
        done
    } >> "$zoo_cfg"
fi

# Always rewrite myid: drop any existing file, then write ZOO_MY_ID
# (falling back to 1 when it is unset or empty).
if [[ -f "$myid_file" ]]; then
    rm "$myid_file"
fi
echo "${ZOO_MY_ID:-1}" > "$myid_file"

exec "$@"
我的docker compose文件如下
# Stack file for a three-member ZooKeeper ensemble: one swarm service per
# member (zoo1, zoo2, zoo3), all attached to the stack network `net`.
# Each service bind-mounts a docker-entrypoint.sh from the host over the
# image's /docker-entrypoint.sh.
version: '3.3'

services:
  zoo1:
    image: zookeeper:3.4.12
    hostname: zoo1
    ports:
      # mode: host publishes each port directly on the node running the task.
      - target: 2181
        published: 2181
        protocol: tcp
        mode: host
      - target: 2888
        published: 2888
        protocol: tcp
        mode: host
      - target: 3888
        published: 3888
        protocol: tcp
        mode: host
    networks:
      - net
    deploy:
      restart_policy:
        condition: on-failure
    environment:
      # Quoted so the value reaches the container as a string.
      ZOO_MY_ID: '1'
      # Folded scalar: the three entries are joined with single spaces.
      # The member's own entry uses 0.0.0.0; peers are addressed by hostname.
      ZOO_SERVERS: >-
        server.1=0.0.0.0:2888:3888
        server.2=zoo2:2888:3888
        server.3=zoo3:2888:3888
    volumes:
      - /home/zk/data:/data
      - /home/zk/datalog:/datalog
      # Host copy of the entrypoint script replaces the one in the image.
      # NOTE(review): presumably this file has to exist at the same path on
      # every node that can run the task - confirm.
      - /home/zk/docker-entrypoint.sh:/docker-entrypoint.sh
      - /etc/localtime:/etc/localtime:ro

  zoo2:
    image: zookeeper:3.4.12
    hostname: zoo2
    ports:
      - target: 2181
        published: 2181
        protocol: tcp
        mode: host
      - target: 2888
        published: 2888
        protocol: tcp
        mode: host
      - target: 3888
        published: 3888
        protocol: tcp
        mode: host
    networks:
      - net
    deploy:
      restart_policy:
        condition: on-failure
    environment:
      ZOO_MY_ID: '2'
      ZOO_SERVERS: >-
        server.1=zoo1:2888:3888
        server.2=0.0.0.0:2888:3888
        server.3=zoo3:2888:3888
    volumes:
      - /home/zk/data:/data
      - /home/zk/datalog:/datalog
      - /home/zk/docker-entrypoint.sh:/docker-entrypoint.sh
      - /etc/localtime:/etc/localtime:ro

  zoo3:
    image: zookeeper:3.4.12
    hostname: zoo3
    ports:
      - target: 2181
        published: 2181
        protocol: tcp
        mode: host
      - target: 2888
        published: 2888
        protocol: tcp
        mode: host
      - target: 3888
        published: 3888
        protocol: tcp
        mode: host
    networks:
      - net
    deploy:
      restart_policy:
        condition: on-failure
    environment:
      ZOO_MY_ID: '3'
      ZOO_SERVERS: >-
        server.1=zoo1:2888:3888
        server.2=zoo2:2888:3888
        server.3=0.0.0.0:2888:3888
    volumes:
      - /home/zk/data:/data
      - /home/zk/datalog:/datalog
      - /home/zk/docker-entrypoint.sh:/docker-entrypoint.sh
      - /etc/localtime:/etc/localtime:ro

networks:
  # Empty mapping: the network is created with default settings.
  net: {}
有了这个,我可以使用 swarm 模式在 docker 中启动并运行 zookeeper 实例,而无需在 compose 文件中对任何主机名进行硬编码。如果我的一个节点出现故障,则会在 swarm 上的任何可用节点上启动服务,而不会出现任何问题。
谢谢