【问题标题】:Worker container failed to connect back to Spark driver工作容器无法连接回 Spark 驱动程序
【发布时间】:2019-08-22 02:31:14
【问题描述】:

我将 Dockerfile 设置为:

# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License
ARG BASE_CONTAINER=jupyter/scipy-notebook
FROM $BASE_CONTAINER

LABEL maintainer="Jupyter Project <jupyter@googlegroups.com>"

USER root

# Spark dependencies
ENV SPARK_VERSION 2.3.2
ENV SPARK_HADOOP_PROFILE 2.7
ENV SPARK_SRC_URL https://www.apache.org/dist/spark/spark-$SPARK_VERSION/spark-${SPARK_VERSION}-bin-hadoop${SPARK_HADOOP_PROFILE}.tgz
ENV SPARK_HOME=/opt/spark
ENV PATH $PATH:$SPARK_HOME/bin

RUN apt-get update && \
     apt-get install -y openjdk-8-jdk-headless \
     postgresql && \
    rm -rf /var/lib/apt/lists/*
ENV JAVA_HOME  /usr/lib/jvm/java-8-openjdk-amd64/

ENV PATH $PATH:$JAVA_HOME/bin


RUN wget ${SPARK_SRC_URL}

RUN tar -xzf spark-${SPARK_VERSION}-bin-hadoop${SPARK_HADOOP_PROFILE}.tgz   

RUN mv spark-${SPARK_VERSION}-bin-hadoop${SPARK_HADOOP_PROFILE} /opt/spark 

RUN rm -f spark-${SPARK_VERSION}-bin-hadoop${SPARK_HADOOP_PROFILE}.tgz





USER $NB_UID
ENV POST_URL https://jdbc.postgresql.org/download/postgresql-42.2.5.jar
RUN wget ${POST_URL}
RUN mv postgresql-42.2.5.jar $SPARK_HOME/jars
# Install pyarrow
RUN conda install --quiet -y 'pyarrow' && \
    conda install pyspark==2.3.2 && \
    conda clean -tipsy && \
    fix-permissions $CONDA_DIR && \
    fix-permissions /home/$NB_USER

WORKDIR $SPARK_HOME

然后我运行命令将 my_notebook 图像设置为: docker build -t my_notebook .

然后我制作了三个容器 Master、Worker 和 Notebook 如下:

掌握使用 docker-compose 文件:

master:
      image: my_notebook
      command: bin/spark-class org.apache.spark.deploy.master.Master -h master
      hostname: master
      environment:
        MASTER: spark://master:7077
        SPARK_CONF_DIR: /conf
        SPARK_PUBLIC_DNS: 192.168.XXX.XXX
      expose:
        - 7001
        - 7002
        - 7003
        - 7004
        - 7005
        - 7077
        - 6066
      ports:
        - 4040:4040
        - 6066:6066
        - 7077:7077
        - 8080:8080
      volumes:
        - ./conf/master:/conf
        - ./data:/tmp/data

工人使用 docker-compose 文件:

worker:
      image: my_notebook
      command: bin/spark-class org.apache.spark.deploy.worker.Worker spark://192.168.XXX.XXX:7077
      hostname: worker
      environment:
        SPARK_CONF_DIR: /conf
        SPARK_WORKER_CORES: 4
        SPARK_WORKER_MEMORY: 4g
        SPARK_WORKER_PORT: 8881
        SPARK_WORKER_WEBUI_PORT: 8081
        SPARK_BLOCKMGR_PORT: 5003
        SPARK_PUBLIC_DNS: localhost
      expose:
        - 7012
        - 7013
        - 7014
        - 7015
        - 8881
        - 5001
        - 5003
      ports:
        - 8081:8081
      volumes:
        - ./conf/worker:/conf
        - ./data:/tmp/data

使用 docker-compose 文件的笔记本:

notebook:
  image: my_notebook
  command: jupyter notebook
  hostname: notebook
  environment:
    SPARK_PUBLIC_DNS: 192.168.XXX.XXX
  expose:
    - 7012
    - 7013
    - 7014
    - 7015
    - 8881
    - 8888
  ports:
    - 8888:8888

首先,我在一台机器上启动了 Master 容器: “码头工人组成” 然后在另一台机器“docker-compose up”上启动 Worker 然后在其他机器“docker-compose up”中启动笔记本

Spark 集群已设置。 Spark UI 是可访问的。工人也在集群中注册。Jupyter notebook 也成功启动。但面临的问题是,当我通过 jupyter worker 执行程序运行 pyspark 应用程序时,无法连接回 spark 驱动程序。 错误日志是:

Spark Executor Command: "/usr/lib/jvm/java-8-openjdk-amd64//bin/java" "-cp" "/conf/:/opt/spark/jars/*" "-Xmx1024M" "-Dspark.driver.port=35147" "org.apache.spark.executor.CoarseGrainedExecutorBackend" "--driver-url" "spark://CoarseGrainedScheduler@notebook:35147" "--executor-id" "31" "--hostname" "172.17.0.3" "--cores" "2" "--app-id" "app-20190101134023-0001" "--worker-url" "spark://Worker@172.17.0.3:8881"
========================================

Exception in thread "main" java.lang.reflect.UndeclaredThrowableException
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1713)
    at org.apache.spark.deploy.SparkHadoopUtil.runAsSparkUser(SparkHadoopUtil.scala:63)
    at org.apache.spark.executor.CoarseGrainedExecutorBackend$.run(CoarseGrainedExecutorBackend.scala:188)
    at org.apache.spark.executor.CoarseGrainedExecutorBackend$.main(CoarseGrainedExecutorBackend.scala:293)
    at org.apache.spark.executor.CoarseGrainedExecutorBackend.main(CoarseGrainedExecutorBackend.scala)
Caused by: org.apache.spark.SparkException: Exception thrown in awaitResult: 
    at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:205)
    at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
    at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:101)
    at org.apache.spark.executor.CoarseGrainedExecutorBackend$$anonfun$run$1.apply$mcV$sp(CoarseGrainedExecutorBackend.scala:201)
    at org.apache.spark.deploy.SparkHadoopUtil$$anon$2.run(SparkHadoopUtil.scala:64)
    at org.apache.spark.deploy.SparkHadoopUtil$$anon$2.run(SparkHadoopUtil.scala:63)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:422)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1698)
    ... 4 more
Caused by: java.io.IOException: Failed to connect to notebook:35147
    at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:245)
    at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:187)
    at org.apache.spark.rpc.netty.NettyRpcEnv.createClient(NettyRpcEnv.scala:198)
    at org.apache.spark.rpc.netty.Outbox$$anon$1.call(Outbox.scala:194)
    at org.apache.spark.rpc.netty.Outbox$$anon$1.call(Outbox.scala:190)
    at java.util.concurrent.FutureTask.run(FutureTask.java:266)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
Caused by: java.net.UnknownHostException: notebook
    at java.net.InetAddress.getAllByName0(InetAddress.java:1281)
    at java.net.InetAddress.getAllByName(InetAddress.java:1193)
    at java.net.InetAddress.getAllByName(InetAddress.java:1127)
    at java.net.InetAddress.getByName(InetAddress.java:1077)
    at io.netty.util.internal.SocketUtils$8.run(SocketUtils.java:146)
    at io.netty.util.internal.SocketUtils$8.run(SocketUtils.java:143)
    at java.security.AccessController.doPrivileged(Native Method)
    at io.netty.util.internal.SocketUtils.addressByName(SocketUtils.java:143)
    at io.netty.resolver.DefaultNameResolver.doResolve(DefaultNameResolver.java:43)
    at io.netty.resolver.SimpleNameResolver.resolve(SimpleNameResolver.java:63)
    at io.netty.resolver.SimpleNameResolver.resolve(SimpleNameResolver.java:55)
    at io.netty.resolver.InetSocketAddressResolver.doResolve(InetSocketAddressResolver.java:57)
    at io.netty.resolver.InetSocketAddressResolver.doResolve(InetSocketAddressResolver.java:32)
    at io.netty.resolver.AbstractAddressResolver.resolve(AbstractAddressResolver.java:108)
    at io.netty.bootstrap.Bootstrap.doResolveAndConnect0(Bootstrap.java:208)
    at io.netty.bootstrap.Bootstrap.access$000(Bootstrap.java:49)
    at io.netty.bootstrap.Bootstrap$1.operationComplete(Bootstrap.java:188)
    at io.netty.bootstrap.Bootstrap$1.operationComplete(Bootstrap.java:174)
    at io.netty.util.concurrent.DefaultPromise.notifyListener0(DefaultPromise.java:507)
    at io.netty.util.concurrent.DefaultPromise.notifyListenersNow(DefaultPromise.java:481)
    at io.netty.util.concurrent.DefaultPromise.notifyListeners(DefaultPromise.java:420)
    at io.netty.util.concurrent.DefaultPromise.trySuccess(DefaultPromise.java:104)
    at io.netty.channel.DefaultChannelPromise.trySuccess(DefaultChannelPromise.java:82)
    at io.netty.channel.AbstractChannel$AbstractUnsafe.safeSetSuccess(AbstractChannel.java:978)
    at io.netty.channel.AbstractChannel$AbstractUnsafe.register0(AbstractChannel.java:512)
    at io.netty.channel.AbstractChannel$AbstractUnsafe.access$200(AbstractChannel.java:423)
    at io.netty.channel.AbstractChannel$AbstractUnsafe$1.run(AbstractChannel.java:482)
    at io.netty.util.concurrent.AbstractEventExecutor.safeExecute(AbstractEventExecutor.java:163)
    at io.netty.util.concurrent.SingleThreadEventExecutor.runAllTasks(SingleThreadEventExecutor.java:403)
    at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:463)
    at io.netty.util.concurrent.SingleThreadEventExecutor$5.run(SingleThreadEventExecutor.java:858)
    at io.netty.util.concurrent.DefaultThreadFactory$DefaultRunnableDecorator.run(DefaultThreadFactory.java:138)
    ... 1 more

谁能帮帮我?

【问题讨论】:

    标签: docker apache-spark


    【解决方案1】:

    为了使notebook:35147 之类的 url 正常工作,容器必须在同一个网络中。在您的情况下,您在不同的机器上启动容器,因此该网络必须是覆盖网络。

    最好的解决方案是使用docker swarmdocker stack 而不是docker-compose,但为了不给你带来新事物的困扰,我们暂时还是坚持写吧。

    首先创建这样一个网络

    我们需要 swarm 模式的一点帮助:

    在一台机器(经理)上执行docker swarm initdocker network create --driver=overlay --attachable my-network

    在其他机器上,docker swarm join 使用你在管理器上获得的令牌。

    然后修改您所有的撰写文件,使其在最后

    networks:
      my-network:
        external: true
    

    并为有问题的服务添加这个

    networks:
       my-network
    

    【讨论】:

    • 或者我也应该在不同的机器上创建覆盖网络?
    • 在哪里添加这些行?在所有 docker-compose 文件中?我有 3 个单独的 docker-compose 文件
    • 很抱歉再次询问其中的服务是什么“并为有问题的网络中的服务添加这个:我的网络”
    • 当我尝试将 worker 与 manager 连接时,它给了我以下来自守护进程的错误消息错误响应:rpc 错误:代码 = 不可用 desc = 所有 SubConns 都处于 TransientFailure,最新连接错误:连接错误:desc =“传输:拨号时出错拨号tcp 192.168.XXX.XXX:2337:连接:连接被拒绝
    猜你喜欢
    • 1970-01-01
    • 2018-01-11
    • 1970-01-01
    • 1970-01-01
    • 2018-05-04
    • 1970-01-01
    • 2016-08-11
    • 2017-06-23
    • 2016-07-24
    相关资源
    最近更新 更多