1. 架构
# Download the binary packages used to build the image.
# The archive.apache.org URLs are used because the dlcdn.apache.org mirror only
# carries the current releases, so links to pinned versions stop working there.
# TLS certificate verification is left enabled: these tarballs end up inside
# the image, so they should not be fetched over an unverified connection.
### 1. Hadoop
# Download index: https://archive.apache.org/dist/hadoop/common/
wget https://archive.apache.org/dist/hadoop/common/hadoop-3.3.5/hadoop-3.3.5.tar.gz
### 2. Hive
# Download index: https://archive.apache.org/dist/hive/
wget https://archive.apache.org/dist/hive/hive-3.1.3/apache-hive-3.1.3-bin.tar.gz
### 3. Spark
# Download index: https://archive.apache.org/dist/spark/
wget https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz
### 4. Flink
# Download index: https://archive.apache.org/dist/flink/
wget https://archive.apache.org/dist/flink/flink-1.17.0/flink-1.17.0-bin-scala_2.12.tgz
2. hadoop集群部署(非高可用)
1) 下载所需的二进制包
# Download the binary packages used to build the image.
# The archive.apache.org URLs are used because the dlcdn.apache.org mirror only
# carries the current releases, so links to pinned versions stop working there.
# TLS certificate verification is left enabled: these tarballs end up inside
# the image, so they should not be fetched over an unverified connection.
### 1. Hadoop
# Download index: https://archive.apache.org/dist/hadoop/common/
wget https://archive.apache.org/dist/hadoop/common/hadoop-3.3.5/hadoop-3.3.5.tar.gz
### 2. Hive
# Download index: https://archive.apache.org/dist/hive/
wget https://archive.apache.org/dist/hive/hive-3.1.3/apache-hive-3.1.3-bin.tar.gz
### 3. Spark
# Download index: https://archive.apache.org/dist/spark/
wget https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz
### 4. Flink
# Download index: https://archive.apache.org/dist/flink/
wget https://archive.apache.org/dist/flink/flink-1.17.0/flink-1.17.0-bin-scala_2.12.tgz
2)Dockerfile文件
# Image bundling JDK 8, Hadoop, Hive, Spark and Flink under /opt/apache.
# The JDK and the four *.tar.gz / *.tgz archives, bootstrap.sh and
# config/hadoop-config/ must be present in the build context.

# The official CentOS 7.9 image is published under its full point-release tag.
# NOTE(review): CentOS 7 is end-of-life; if `yum install` below cannot reach
# its mirrors, point the yum repos at the CentOS vault first.
FROM centos:7.9.2009

# Set the container time zone to Asia/Shanghai.
RUN rm -f /etc/localtime && \
    ln -sv /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \
    echo "Asia/Shanghai" > /etc/timezone

# ENV (not `RUN export`) is required: an exported variable only lives for the
# shell of that single build step and never reaches the final image.
# NOTE(review): the base image may not ship zh_CN locale data; generate or
# install it if locale warnings show up at runtime.
ENV LANG=zh_CN.UTF-8

# Create the hadoop group and user with fixed IDs 10000:10000; the compose
# file runs every container as this user.
RUN groupadd --system --gid=10000 hadoop && \
    useradd --system --home-dir /home/hadoop --uid=10000 --gid=hadoop hadoop

# Install sudo and common network tools. `&&` makes the build fail when yum
# fails instead of silently continuing to the chmod.
RUN yum -y install sudo net-tools telnet wget nc curl && \
    yum clean all && \
    chmod 640 /etc/sudoers

# Grant the hadoop user passwordless sudo.
RUN echo "hadoop ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers

RUN mkdir -p /opt/apache/

# JDK (ADD unpacks local tar archives automatically).
ADD jdk-8u212-linux-x64.tar.gz /opt/apache/
ENV JAVA_HOME=/opt/apache/jdk1.8.0_212
ENV PATH=$JAVA_HOME/bin:$PATH

# Hadoop: unpack the versioned directory and expose it through a stable
# /opt/apache/hadoop symlink.
ENV HADOOP_VERSION=3.3.5
ADD hadoop-${HADOOP_VERSION}.tar.gz /opt/apache/
ENV HADOOP_HOME=/opt/apache/hadoop
RUN ln -s /opt/apache/hadoop-${HADOOP_VERSION} $HADOOP_HOME
ENV HADOOP_COMMON_HOME=${HADOOP_HOME} \
    HADOOP_HDFS_HOME=${HADOOP_HOME} \
    HADOOP_MAPRED_HOME=${HADOOP_HOME} \
    HADOOP_YARN_HOME=${HADOOP_HOME} \
    HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop \
    PATH=${PATH}:${HADOOP_HOME}/bin

# Hive
ENV HIVE_VERSION=3.1.3
ADD apache-hive-${HIVE_VERSION}-bin.tar.gz /opt/apache/
ENV HIVE_HOME=/opt/apache/hive
ENV PATH=$HIVE_HOME/bin:$PATH
RUN ln -s /opt/apache/apache-hive-${HIVE_VERSION}-bin ${HIVE_HOME}

# Spark
ENV SPARK_VERSION=3.3.2
ADD spark-${SPARK_VERSION}-bin-hadoop3.tgz /opt/apache/
ENV SPARK_HOME=/opt/apache/spark
ENV PATH=$SPARK_HOME/bin:$PATH
RUN ln -s /opt/apache/spark-${SPARK_VERSION}-bin-hadoop3 ${SPARK_HOME}

# Flink
ENV FLINK_VERSION=1.17.0
ADD flink-${FLINK_VERSION}-bin-scala_2.12.tgz /opt/apache/
ENV FLINK_HOME=/opt/apache/flink
ENV PATH=$FLINK_HOME/bin:$PATH
RUN ln -s /opt/apache/flink-${FLINK_VERSION} ${FLINK_HOME}

# Create the NameNode, DataNode and YARN storage directories.
# This line relies on brace expansion in the shell that executes RUN.
RUN mkdir -p /opt/apache/hadoop/data/{hdfs,yarn} /opt/apache/hadoop/data/hdfs/namenode /opt/apache/hadoop/data/hdfs/datanode/data{1..3} /opt/apache/hadoop/data/yarn/{local-dirs,log-dirs,apps}

COPY bootstrap.sh /opt/apache/
COPY config/hadoop-config/* ${HADOOP_HOME}/etc/hadoop/

# The compose file executes bootstrap.sh directly, so it must carry the
# execute bit regardless of the file mode in the build context.
RUN chmod +x /opt/apache/bootstrap.sh && \
    chown -R hadoop:hadoop /opt/apache

# Defines an environment variable named `ll`; this is not a shell alias.
ENV ll="ls -l"

WORKDIR /opt/apache
3)构建镜像
# Build the image from the Dockerfile in the current directory.
docker build -t hadoop:v1 . --no-cache
### Parameter notes
# -t: name (and tag) of the image to build
# . : build context, i.e. the current directory; its Dockerfile is used by default
# -f: path to the Dockerfile (not used in the command above)
# --no-cache: do not use the build cache
4)准备所需的配置文件、脚本等
Hadoop 配置
主要有以下几个文件:core-site.xml、dfs.hosts、dfs.hosts.exclude、hdfs-site.xml、mapred-site.xml、yarn-hosts-exclude、yarn-hosts-include、yarn-site.xml
Hive 配置
主要有以下几个文件:hive-env.sh、hive-site.xml
# Write the port settings to .env; docker-compose.yml loads this file via
# env_file and references the same names as ${...} variables.
cat <<'EOF' > .env
HADOOP_HDFS_NN_PORT=9870
HADOOP_HDFS_DN_PORT=9864
HADOOP_YARN_RM_PORT=8088
HADOOP_YARN_NM_PORT=8042
HADOOP_YARN_PROXYSERVER_PORT=9111
HADOOP_MR_HISTORYSERVER_PORT=19888
EOF
启动脚本bootstrap.sh:
#!/usr/bin/env sh
# bootstrap.sh
#
# Container entry point: starts the single Hadoop daemon selected by the first
# argument and then follows that daemon's log file in the foreground.
#
# Usage: bootstrap.sh <service> [wait-host wait-port]
#   <service>    one of: hadoop-hdfs-nn, hadoop-hdfs-dn, hadoop-yarn-rm,
#                hadoop-yarn-nm, hadoop-yarn-proxyserver,
#                hadoop-mr-historyserver
#   wait-host    host that must accept TCP connections before the daemon starts
#   wait-port    port on wait-host to probe
#
# The shebang has to be the very first line of the file, otherwise it is
# treated as an ordinary comment.

# Abort early with a clear message if HADOOP_HOME is missing.
: "${HADOOP_HOME:?HADOOP_HOME must be set}"

# Block until host $1 accepts TCP connections on port $2.
# Returns 1 without waiting when either argument is missing, instead of
# probing forever with an invalid nc command line.
wait_for() {
    if [ -z "$1" ] || [ -z "$2" ]; then
        echo "wait_for: host and port are required" >&2
        return 1
    fi
    echo "Waiting for $1 to listen on $2..."
    while ! nc -z "$1" "$2"; do
        echo waiting...
        sleep 1
    done
}

# Start daemon $2 with launcher ${HADOOP_HOME}/bin/$1, then follow its log.
_start_and_follow() {
    "${HADOOP_HOME}/bin/$1" --loglevel INFO --daemon start "$2"
    tail -f "${HADOOP_HOME}"/logs/*"$2"*.log
}

# Format the NameNode on first start, then run it.
start_hdfs_namenode() {
    _nn_marker=/tmp/namenode-formated
    if [ ! -f "${_nn_marker}" ]; then
        # Write the format output to a temporary file and promote it to the
        # marker only on success, so a failed format is retried next start.
        if "${HADOOP_HOME}/bin/hdfs" namenode -format >"${_nn_marker}.tmp"; then
            mv "${_nn_marker}.tmp" "${_nn_marker}"
        else
            echo "hdfs namenode -format failed; output is in ${_nn_marker}.tmp" >&2
            exit 1
        fi
    fi
    _start_and_follow hdfs namenode
}

# Wait for $1:$2, then run a DataNode.
start_hdfs_datanode() {
    wait_for "$1" "$2" || exit 1
    _start_and_follow hdfs datanode
}

# Run the YARN ResourceManager.
start_yarn_resourcemanager() {
    _start_and_follow yarn resourcemanager
}

# Wait for $1:$2, then run a YARN NodeManager.
start_yarn_nodemanager() {
    wait_for "$1" "$2" || exit 1
    _start_and_follow yarn nodemanager
}

# Wait for $1:$2, then run the YARN web proxy server.
start_yarn_proxyserver() {
    wait_for "$1" "$2" || exit 1
    _start_and_follow yarn proxyserver
}

# Wait for $1:$2, then run the MapReduce JobHistory server.
start_mr_historyserver() {
    wait_for "$1" "$2" || exit 1
    _start_and_follow mapred historyserver
}

case "$1" in
    hadoop-hdfs-nn)
        start_hdfs_namenode
        ;;
    hadoop-hdfs-dn)
        start_hdfs_datanode "$2" "$3"
        ;;
    hadoop-yarn-rm)
        start_yarn_resourcemanager
        ;;
    hadoop-yarn-nm)
        start_yarn_nodemanager "$2" "$3"
        ;;
    hadoop-yarn-proxyserver)
        start_yarn_proxyserver "$2" "$3"
        ;;
    hadoop-mr-historyserver)
        start_mr_historyserver "$2" "$3"
        ;;
    *)
        # Unknown service name: report it on stderr and fail, so the mistake
        # is not reported as a successful run.
        echo "请输入正确的服务启动命令~" >&2
        exit 1
        ;;
esac
5)docker-compose.yml文件
# Non-HA Hadoop cluster: 1 NameNode, 3 DataNodes, 1 ResourceManager,
# 3 NodeManagers, 1 YARN proxy server and 1 MapReduce history server.
# Every service runs the same image and picks its role through the first
# argument passed to /opt/apache/bootstrap.sh. Port variables come from .env.
version: "3"

services:
  # HDFS NameNode
  hadoop-hdfs-nn:
    image: hadoop:v1
    user: "hadoop:hadoop"
    container_name: hadoop-hdfs-nn
    hostname: hadoop-hdfs-nn
    restart: always
    env_file:
      - .env
    ports:
      - "30070:${HADOOP_HDFS_NN_PORT}"
    command:
      - "sh"
      - "-c"
      - "/opt/apache/bootstrap.sh hadoop-hdfs-nn"
    networks:
      - hadoop_network
    healthcheck:
      test:
        - "CMD-SHELL"
        - "curl --fail http://localhost:${HADOOP_HDFS_NN_PORT} || exit 1"
      interval: 10s
      timeout: 5s
      retries: 3

  # HDFS DataNodes; each waits for the NameNode port before starting.
  hadoop-hdfs-dn-0:
    image: hadoop:v1
    user: "hadoop:hadoop"
    container_name: hadoop-hdfs-dn-0
    hostname: hadoop-hdfs-dn-0
    restart: always
    depends_on:
      - hadoop-hdfs-nn
    env_file:
      - .env
    ports:
      - "30864:${HADOOP_HDFS_DN_PORT}"
    command:
      - "sh"
      - "-c"
      - "/opt/apache/bootstrap.sh hadoop-hdfs-dn hadoop-hdfs-nn ${HADOOP_HDFS_NN_PORT}"
    networks:
      - hadoop_network
    healthcheck:
      test:
        - "CMD-SHELL"
        - "curl --fail http://localhost:${HADOOP_HDFS_DN_PORT} || exit 1"
      interval: 10s
      timeout: 5s
      retries: 3

  hadoop-hdfs-dn-1:
    image: hadoop:v1
    user: "hadoop:hadoop"
    container_name: hadoop-hdfs-dn-1
    hostname: hadoop-hdfs-dn-1
    restart: always
    depends_on:
      - hadoop-hdfs-nn
    env_file:
      - .env
    ports:
      - "30865:${HADOOP_HDFS_DN_PORT}"
    command:
      - "sh"
      - "-c"
      - "/opt/apache/bootstrap.sh hadoop-hdfs-dn hadoop-hdfs-nn ${HADOOP_HDFS_NN_PORT}"
    networks:
      - hadoop_network
    healthcheck:
      test:
        - "CMD-SHELL"
        - "curl --fail http://localhost:${HADOOP_HDFS_DN_PORT} || exit 1"
      interval: 10s
      timeout: 5s
      retries: 3

  hadoop-hdfs-dn-2:
    image: hadoop:v1
    user: "hadoop:hadoop"
    container_name: hadoop-hdfs-dn-2
    hostname: hadoop-hdfs-dn-2
    restart: always
    depends_on:
      - hadoop-hdfs-nn
    env_file:
      - .env
    ports:
      - "30866:${HADOOP_HDFS_DN_PORT}"
    command:
      - "sh"
      - "-c"
      - "/opt/apache/bootstrap.sh hadoop-hdfs-dn hadoop-hdfs-nn ${HADOOP_HDFS_NN_PORT}"
    networks:
      - hadoop_network
    healthcheck:
      test:
        - "CMD-SHELL"
        - "curl --fail http://localhost:${HADOOP_HDFS_DN_PORT} || exit 1"
      interval: 10s
      timeout: 5s
      retries: 3

  # YARN ResourceManager
  hadoop-yarn-rm:
    image: hadoop:v1
    user: "hadoop:hadoop"
    container_name: hadoop-yarn-rm
    hostname: hadoop-yarn-rm
    restart: always
    env_file:
      - .env
    ports:
      - "30888:${HADOOP_YARN_RM_PORT}"
    command:
      - "sh"
      - "-c"
      - "/opt/apache/bootstrap.sh hadoop-yarn-rm"
    networks:
      - hadoop_network
    healthcheck:
      test:
        - "CMD-SHELL"
        - "netstat -tnlp|grep :${HADOOP_YARN_RM_PORT} || exit 1"
      interval: 10s
      timeout: 5s
      retries: 3

  # YARN NodeManagers; each waits for the ResourceManager port before starting.
  hadoop-yarn-nm-0:
    image: hadoop:v1
    user: "hadoop:hadoop"
    container_name: hadoop-yarn-nm-0
    hostname: hadoop-yarn-nm-0
    restart: always
    depends_on:
      - hadoop-yarn-rm
    env_file:
      - .env
    ports:
      - "30042:${HADOOP_YARN_NM_PORT}"
    command:
      - "sh"
      - "-c"
      - "/opt/apache/bootstrap.sh hadoop-yarn-nm hadoop-yarn-rm ${HADOOP_YARN_RM_PORT}"
    networks:
      - hadoop_network
    healthcheck:
      test:
        - "CMD-SHELL"
        - "curl --fail http://localhost:${HADOOP_YARN_NM_PORT} || exit 1"
      interval: 10s
      timeout: 5s
      retries: 3

  hadoop-yarn-nm-1:
    image: hadoop:v1
    user: "hadoop:hadoop"
    container_name: hadoop-yarn-nm-1
    hostname: hadoop-yarn-nm-1
    restart: always
    depends_on:
      - hadoop-yarn-rm
    env_file:
      - .env
    ports:
      - "30043:${HADOOP_YARN_NM_PORT}"
    command:
      - "sh"
      - "-c"
      - "/opt/apache/bootstrap.sh hadoop-yarn-nm hadoop-yarn-rm ${HADOOP_YARN_RM_PORT}"
    networks:
      - hadoop_network
    healthcheck:
      test:
        - "CMD-SHELL"
        - "curl --fail http://localhost:${HADOOP_YARN_NM_PORT} || exit 1"
      interval: 10s
      timeout: 5s
      retries: 3

  hadoop-yarn-nm-2:
    image: hadoop:v1
    user: "hadoop:hadoop"
    container_name: hadoop-yarn-nm-2
    hostname: hadoop-yarn-nm-2
    restart: always
    depends_on:
      - hadoop-yarn-rm
    env_file:
      - .env
    ports:
      - "30044:${HADOOP_YARN_NM_PORT}"
    command:
      - "sh"
      - "-c"
      - "/opt/apache/bootstrap.sh hadoop-yarn-nm hadoop-yarn-rm ${HADOOP_YARN_RM_PORT}"
    networks:
      - hadoop_network
    healthcheck:
      test:
        - "CMD-SHELL"
        - "curl --fail http://localhost:${HADOOP_YARN_NM_PORT} || exit 1"
      interval: 10s
      timeout: 5s
      retries: 3

  # YARN web proxy server; waits for the ResourceManager port.
  hadoop-yarn-proxyserver:
    image: hadoop:v1
    user: "hadoop:hadoop"
    container_name: hadoop-yarn-proxyserver
    hostname: hadoop-yarn-proxyserver
    restart: always
    depends_on:
      - hadoop-yarn-rm
    env_file:
      - .env
    ports:
      - "30911:${HADOOP_YARN_PROXYSERVER_PORT}"
    command:
      - "sh"
      - "-c"
      - "/opt/apache/bootstrap.sh hadoop-yarn-proxyserver hadoop-yarn-rm ${HADOOP_YARN_RM_PORT}"
    networks:
      - hadoop_network
    healthcheck:
      test:
        - "CMD-SHELL"
        - "netstat -tnlp|grep :${HADOOP_YARN_PROXYSERVER_PORT} || exit 1"
      interval: 10s
      timeout: 5s
      retries: 3

  # MapReduce JobHistory server; waits for the ResourceManager port.
  hadoop-mr-historyserver:
    image: hadoop:v1
    user: "hadoop:hadoop"
    container_name: hadoop-mr-historyserver
    hostname: hadoop-mr-historyserver
    restart: always
    depends_on:
      - hadoop-yarn-rm
    env_file:
      - .env
    ports:
      - "31988:${HADOOP_MR_HISTORYSERVER_PORT}"
    command:
      - "sh"
      - "-c"
      - "/opt/apache/bootstrap.sh hadoop-mr-historyserver hadoop-yarn-rm ${HADOOP_YARN_RM_PORT}"
    networks:
      - hadoop_network
    healthcheck:
      test:
        - "CMD-SHELL"
        - "netstat -tnlp|grep :${HADOOP_MR_HISTORYSERVER_PORT} || exit 1"
      interval: 10s
      timeout: 5s
      retries: 3

# Single bridge network shared by all services above.
networks:
  hadoop_network:
    driver: bridge
注意:
- 如果容器是由不同的 compose 文件创建的,并且没有指定同一个 network,它们之间是不能通过主机名互相访问的。
- depends_on 只能决定容器的启动先后顺序,无法决定容器里服务的启动顺序,作用不大,所以在上面的 bootstrap.sh 脚本里加上了一个 wait_for 函数来真正控制服务的启动顺序。
6)访问检查验证
HDFS:http://ip:30070/
YARN:http://ip:30888/