【Posted】:2021-08-23 06:42:15
【Problem Description】:
I'm trying to get the Apache Airflow image to install h2o; normally I would simply run pip install h2o.
According to this answer, I need to extend the image, which I did.
airflow/Dockerfile:
FROM apache/airflow:2.1.2
USER root
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
         build-essential h2o \
    && apt-get autoremove -yqq --purge \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
USER airflow
RUN pip install --no-cache-dir --user h2o
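As a sanity check, the Dockerfile can be built and inspected on its own, independent of Compose (the tag my-airflow:2.1.2-h2o is arbitrary; --entrypoint bypasses the image's Airflow entrypoint so pip runs directly):
# build the extended image from the airflow/ directory
docker build -t my-airflow:2.1.2-h2o ./airflow
# confirm the package landed in the image
docker run --rm --entrypoint pip my-airflow:2.1.2-h2o show h2o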
docker-compose.yaml:
---
version: "3"
x-airflow-common:
  build: ./airflow
  environment: &ref_0
    AIRFLOW__CORE__EXECUTOR: CeleryExecutor
    AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
    AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow
    AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0
    AIRFLOW__CORE__FERNET_KEY: ""
    AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: "true"
    AIRFLOW__CORE__LOAD_EXAMPLES: "true"
    AIRFLOW__API__AUTH_BACKEND: airflow.api.auth.backend.basic_auth
    _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-}
    AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID}
    AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_ACCESS_KEY}
    AWS_SESSION_TOKEN: ${AWS_SESSION_TOKEN}
    AWS_ROLE_ARN: ${AWS_ROLE_ARN}
    REGION_NAME: ${REGION_NAME}
  volumes: &ref_1
    - ./dags:/opt/airflow/dags
    - ./logs:/opt/airflow/logs
    - ./plugins:/opt/airflow/plugins
  user: ${AIRFLOW_UID:-50000}:${AIRFLOW_GID:-50000}
  depends_on: &ref_2
    redis:
      condition: service_healthy
    postgres:
      condition: service_healthy
services:
  postgres:
    image: postgres:13
    environment:
      POSTGRES_USER: airflow
      POSTGRES_PASSWORD: airflow
      POSTGRES_DB: airflow
    volumes:
      - postgres-db-volume:/var/lib/postgresql/data
    healthcheck:
      test:
        - CMD
        - pg_isready
        - -U
        - airflow
      interval: 5s
      retries: 5
    restart: always
  redis:
    image: redis:latest
    ports:
      - 6379:6379
    healthcheck:
      test:
        - CMD
        - redis-cli
        - ping
      interval: 5s
      timeout: 30s
      retries: 50
    restart: always
  airflow-webserver:
    image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.1.2}
    environment: *ref_0
    volumes: *ref_1
    user: ${AIRFLOW_UID:-50000}:${AIRFLOW_GID:-50000}
    depends_on: *ref_2
    command: webserver
    ports:
      - 8080:8080
    healthcheck:
      test:
        - CMD
        - curl
        - --fail
        - http://localhost:8080/health
      interval: 10s
      timeout: 10s
      retries: 5
    restart: always
  airflow-scheduler:
    image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.1.2}
    environment: *ref_0
    volumes: *ref_1
    user: ${AIRFLOW_UID:-50000}:${AIRFLOW_GID:-50000}
    depends_on: *ref_2
    command: scheduler
    healthcheck:
      test:
        - CMD-SHELL
        - airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"
      interval: 10s
      timeout: 10s
      retries: 5
    restart: always
  airflow-worker:
    image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.1.2}
    environment: *ref_0
    volumes: *ref_1
    user: ${AIRFLOW_UID:-50000}:${AIRFLOW_GID:-50000}
    depends_on: *ref_2
    command: celery worker
    healthcheck:
      test:
        - CMD-SHELL
        - >-
          celery --app airflow.executors.celery_executor.app inspect ping -d
          "celery@$${HOSTNAME}"
      interval: 10s
      timeout: 10s
      retries: 5
    restart: always
  airflow-init:
    image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.1.2}
    environment:
      AIRFLOW__CORE__EXECUTOR: CeleryExecutor
      AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
      AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow
      AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0
      AIRFLOW__CORE__FERNET_KEY: ""
      AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: "true"
      AIRFLOW__CORE__LOAD_EXAMPLES: "true"
      AIRFLOW__API__AUTH_BACKEND: airflow.api.auth.backend.basic_auth
      _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-}
      AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID}
      AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_ACCESS_KEY}
      AWS_SESSION_TOKEN: ${AWS_SESSION_TOKEN}
      REGION_NAME: ${REGION_NAME}
      _AIRFLOW_DB_UPGRADE: "true"
      _AIRFLOW_WWW_USER_CREATE: "true"
      _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow}
      _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow}
    volumes: *ref_1
    user: ${AIRFLOW_UID:-50000}:${AIRFLOW_GID:-50000}
    depends_on: *ref_2
    command: version
  flower:
    image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.1.2}
    environment: *ref_0
    volumes: *ref_1
    user: ${AIRFLOW_UID:-50000}:${AIRFLOW_GID:-50000}
    depends_on: *ref_2
    command: celery flower
    ports:
      - 5555:5555
    healthcheck:
      test:
        - CMD
        - curl
        - --fail
        - http://localhost:5555/
      interval: 10s
      timeout: 10s
      retries: 5
    restart: always
volumes:
  postgres-db-volume:
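For comparison, the stock docker-compose.yaml from the Airflow docs shares the x-airflow-common block with every service through a YAML merge key, so a build: entry there applies to all Airflow containers. A minimal, abridged sketch of that wiring:
x-airflow-common: &airflow-common
  build: ./airflow    # replaces the fixed image: line, so Compose builds the extended image
  environment:
    AIRFLOW__CORE__EXECUTOR: CeleryExecutor
    # ...remaining shared settings as above...
services:
  airflow-webserver:
    <<: *airflow-common    # merge key: pulls build, environment, volumes, user, depends_on into the service
    command: webserver
    ports:
      - 8080:8080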
When I check the installed packages with docker exec -it <CONTAINER_ID> pip list, h2o is nowhere to be found, even though I have also added the package to requirements.txt.
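As a stopgap, the compose file above already forwards _PIP_ADDITIONAL_REQUIREMENTS into every Airflow container, and the official images pip-install whatever it lists at container startup. This is meant for quick tests only (it reinstalls on every start), but it is one way to confirm the package itself installs cleanly:
# quick test only, not for production use
_PIP_ADDITIONAL_REQUIREMENTS="h2o" docker-compose up -d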
Directory structure:
├── airflow
│   └── Dockerfile
├── dags
│   ├── 01_lasic_retraining_overview.py
│   ├── 02_lasic_retraining_sagemaker_autopilot.py
│   ├── 03_lasic_retraining_h20_automl.py
│   ├── __init__.py
│   └── common
│       ├── __init__.py
│       └── helper.py
├── docker-compose.yaml
├── requirements.txt
Confirmation of the error inside Airflow:
Edit:
I also tried this answer, but the problem is still the same.
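For completeness, the reset cycle I run between attempts, with --no-cache added to rule out stale layers (note that docker-compose build only rebuilds services that actually have a build: entry):
docker-compose down -v
docker-compose build --no-cache
docker-compose up -d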
【Discussion】:
- Please use language tags.
- Please confirm that the Dockerfile inside the container reflects the changes you made. I suspect you may have edited the Dockerfile on the outside without recreating the image.
- @AndrewWei I ran docker-compose down -v && docker-compose up -d --build, which should reset everything once I make the right changes, shouldn't it?
- When you rebuild, do you see the pip install step run? I think it only builds the image that is in use, and won't necessarily run all the steps.
- I attached to the container and ran pip install h2o, and it installed fine, which suggests the failure is happening in the Dockerfile.