Dockerfile based on the python:3.6 image; the underlying Linux distribution of this image is Debian:
FROM python:3.6
ARG WORK_DIR=/opt
WORKDIR $WORK_DIR
# java
ADD jdk-8u281-linux-x64.tar.gz $WORK_DIR
RUN mv jdk1.8.0_281 jdk
ENV JAVA_HOME $WORK_DIR/jdk
ENV JRE_HOME $JAVA_HOME/jre
ENV CLASSPATH .:$JAVA_HOME/lib:$JRE_HOME/lib
ENV PATH $PATH:$JAVA_HOME/bin
# hadoop
ADD hadoop-2.7.7.tar.gz $WORK_DIR
RUN mv hadoop-2.7.7 hadoop
RUN mkdir -p /home/hadoop/tmp /home/hadoop/dfs/name /home/hadoop/dfs/data
ENV HADOOP_HOME $WORK_DIR/hadoop
ENV PATH $PATH:$JAVA_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
COPY core-site.xml $HADOOP_HOME/etc/hadoop/core-site.xml
COPY hdfs-site.xml $HADOOP_HOME/etc/hadoop/hdfs-site.xml
COPY mapred-site.xml $HADOOP_HOME/etc/hadoop/mapred-site.xml
COPY yarn-site.xml $HADOOP_HOME/etc/hadoop/yarn-site.xml
COPY slaves $HADOOP_HOME/etc/hadoop/slaves
RUN echo export JAVA_HOME=$JAVA_HOME >> $HADOOP_HOME/etc/hadoop/hadoop-env.sh
# spark
ADD spark-2.4.7-bin-hadoop2.7.tgz $WORK_DIR
RUN mv spark-2.4.7-bin-hadoop2.7 spark
ENV SPARK_HOME /opt/spark
ENV PATH $PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin
RUN echo export JAVA_HOME=$JAVA_HOME >> $SPARK_HOME/conf/spark-env.sh
COPY slaves $SPARK_HOME/conf/slaves
RUN sed -i s@deb.debian.org@mirrors.aliyun.com@g /etc/apt/sources.list
RUN sed -i s@security.debian.org@mirrors.aliyun.com@g /etc/apt/sources.list
RUN apt-get update && apt-get install -y openssh-server
RUN ssh-keygen -t rsa -N '' -f ~/.ssh/id_rsa && \
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys && \
chmod 600 ~/.ssh/authorized_keys
RUN echo ' StrictHostKeyChecking no' >> /etc/ssh/ssh_config
COPY apache-livy-0.7.0-incubating-bin.zip $WORK_DIR
RUN cd $WORK_DIR && unzip apache-livy-0.7.0-incubating-bin.zip && \
rm -f apache-livy-0.7.0-incubating-bin.zip && \
mv apache-livy-0.7.0-incubating-bin livy
ENV HADOOP_CONF_DIR $HADOOP_HOME/etc/hadoop
ENV PATH $PATH:$WORK_DIR/livy/bin
RUN mkdir /var/run/sshd
EXPOSE 22
CMD ["/usr/sbin/sshd", "-D"]
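Assuming the Dockerfile above sits in the same directory as the JDK, Hadoop, Spark and Livy archives plus the XML and slaves files, the image can be built and tagged spark, which is the image name used by docker-compose.yml below:
# build the Debian-based image
docker build -t spark .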
CentOS-based version. Python has to be installed manually on this one; the Dockerfile below does not install it yet, and I will add the Python installation steps later when I have time:
FROM centos
ARG WORK_DIR=/opt
# java
ADD jdk-8u281-linux-x64.tar.gz $WORK_DIR
RUN mv $WORK_DIR/jdk1.8.0_281 $WORK_DIR/jdk
ENV JAVA_HOME $WORK_DIR/jdk
ENV JRE_HOME $JAVA_HOME/jre
ENV CLASSPATH .:$JAVA_HOME/lib:$JRE_HOME/lib
ENV PATH $PATH:$JAVA_HOME/bin
# hadoop
ADD hadoop-2.7.7.tar.gz $WORK_DIR
RUN mv $WORK_DIR/hadoop-2.7.7 $WORK_DIR/hadoop
RUN mkdir -p /home/hadoop/tmp /home/hadoop/dfs/name /home/hadoop/dfs/data
ENV HADOOP_HOME $WORK_DIR/hadoop
ENV PATH $PATH:$JAVA_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
COPY core-site.xml $HADOOP_HOME/etc/hadoop/core-site.xml
COPY hdfs-site.xml $HADOOP_HOME/etc/hadoop/hdfs-site.xml
COPY mapred-site.xml $HADOOP_HOME/etc/hadoop/mapred-site.xml
COPY yarn-site.xml $HADOOP_HOME/etc/hadoop/yarn-site.xml
COPY slaves $HADOOP_HOME/etc/hadoop/slaves
RUN echo export JAVA_HOME=$JAVA_HOME >> $HADOOP_HOME/etc/hadoop/hadoop-env.sh
# spark
ADD spark-2.4.7-bin-hadoop2.7.tgz $WORK_DIR
RUN mv $WORK_DIR/spark-2.4.7-bin-hadoop2.7 $WORK_DIR/spark
ENV SPARK_HOME /opt/spark
ENV PATH $PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin
RUN echo export JAVA_HOME=$JAVA_HOME >> $SPARK_HOME/conf/spark-env.sh
COPY slaves $SPARK_HOME/conf/slaves
RUN yum install -y openssh-server openssh-clients which zip unzip
RUN ssh-keygen -t dsa -N '' -f /etc/ssh/ssh_host_dsa_key
RUN ssh-keygen -t rsa -N '' -f /etc/ssh/ssh_host_rsa_key
RUN ssh-keygen -t ecdsa -N '' -f /etc/ssh/ssh_host_ecdsa_key
RUN ssh-keygen -t ed25519 -N '' -f /etc/ssh/ssh_host_ed25519_key
RUN ssh-keygen -t rsa -N '' -f ~/.ssh/id_rsa && \
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys && \
chmod 600 ~/.ssh/authorized_keys
RUN echo -e 'Host *\n StrictHostKeyChecking no' > /etc/ssh/ssh_config.d/default.conf
COPY apache-livy-0.7.0-incubating-bin.zip $WORK_DIR
RUN cd $WORK_DIR && unzip apache-livy-0.7.0-incubating-bin.zip && \
rm -f apache-livy-0.7.0-incubating-bin.zip && \
mv apache-livy-0.7.0-incubating-bin livy
ENV HADOOP_CONF_DIR $HADOOP_HOME/etc/hadoop
ENV PATH $PATH:$WORK_DIR/livy/bin
WORKDIR $WORK_DIR
RUN mkdir /var/run/sshd
EXPOSE 22
CMD ["/usr/sbin/sshd", "-D"]
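The CentOS variant is built the same way; if both Dockerfiles live in one directory, give this one its own file name (Dockerfile.centos below is only an example) and select it with -f:
# build the CentOS-based image from a separately named Dockerfile
docker build -f Dockerfile.centos -t spark .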
docker-compose.yml
version: '3.7'
services:
  master:
    image: spark
    restart: unless-stopped
    ports:
      - 8080:8080
      - 8088:8088
      - 9000:9000
      - 8998:8998
  slave1:
    image: spark
    restart: unless-stopped
  slave2:
    image: spark
    restart: unless-stopped
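With the spark image built, the three containers can be started in the background and a shell opened in the master container:
docker-compose up -d
docker-compose exec master bash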
core-site.xml
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://master:9000</value>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>file:/opt/hadoop/tmp</value>
</property>
</configuration>
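After the cluster has been started (see the commands at the end of this post), a quick check from the master container confirms that the NameNode is answering at the fs.defaultFS address configured above:
hdfs dfs -mkdir -p /tmp
hdfs dfs -ls /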
hdfs-site.xml
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>master:9001</value>
</property>
<property>
<name>dfs.namenode.name.dir</name>
<value>file:/home/hadoop/dfs/name</value>
</property>
<property>
<name>dfs.datanode.data.dir</name>
<value>file:/home/hadoop/dfs/data</value>
</property>
<property>
<name>dfs.replication</name>
<value>2</value>
</property>
</configuration>
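Because dfs.replication is set to 2, at least two DataNodes have to register; after startup this can be verified from the master container, where the report should list the worker nodes as live DataNodes:
hdfs dfsadmin -report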
mapred-site.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
</configuration>
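With mapreduce.framework.name set to yarn, MapReduce jobs are submitted to the ResourceManager; the example jar shipped with Hadoop 2.7.7 makes a simple smoke test once the cluster is up:
hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.7.jar pi 2 10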
yarn-site.xml
<?xml version="1.0"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<configuration>
<!-- Site specific YARN configuration properties -->
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<!-- Host that runs the ResourceManager -->
<property>
<name>yarn.resourcemanager.hostname</name>
<value>master</value>
</property>
<property>
<name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
<value>org.apache.hadoop.mapred.ShuffleHandler</value>
</property>
<property>
<name>yarn.resourcemanager.address</name>
<value>master:8032</value>
</property>
<property>
<name>yarn.resourcemanager.scheduler.address</name>
<value>master:8030</value>
</property>
<property>
<name>yarn.resourcemanager.resource-tracker.address</name>
<value>master:8031</value>
</property>
<property>
<name>yarn.resourcemanager.admin.address</name>
<value>master:8033</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.address</name>
<value>master:8088</value>
</property>
<!-- Enable log aggregation -->
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
</configuration>
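Once YARN is running, the NodeManagers registered with the ResourceManager can be listed from the master container; the web UI configured above is also reachable from the host through the 8088 port mapping in docker-compose.yml:
yarn node -list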
The settings in these four files can be looked up in the Hadoop documentation and adjusted to fit your own needs.
slaves
# These are the hostnames of the three nodes. Since the cluster is started with docker-compose, each node can be reached by the service name defined in docker-compose.yml
master
slave1
slave2
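Since start-all.sh logs in to every host listed in slaves over SSH, it is worth checking from the master container that password-less SSH to the workers works (the same key pair is baked into every container by the Dockerfile):
ssh slave1 hostname
ssh slave2 hostname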
After the compose stack is up, open a shell in the master container and run the following commands
# on the very first startup, format the NameNode before starting HDFS
/opt/hadoop/bin/hdfs namenode -format
# start Hadoop, Spark, and Livy, in that order
/opt/hadoop/sbin/start-all.sh
/opt/spark/sbin/start-all.sh
livy-server start
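Once everything is started, jps should show the HDFS, YARN, Spark and Livy daemons on the master. A sample Spark job can then be submitted to YARN (jar path as shipped with the Spark 2.4.7 / Hadoop 2.7 build), and Livy can be queried through its REST API on port 8998, which is mapped to the host in docker-compose.yml:
jps
spark-submit --master yarn --class org.apache.spark.examples.SparkPi $SPARK_HOME/examples/jars/spark-examples_2.11-2.4.7.jar 10
curl http://localhost:8998/sessions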
Reference:
https://blog.csdn.net/qq_39494664/article/details/106001216