@yangwenbo
2022-07-08T11:49:32.000000Z
Tsinghua University, FIB Lab
Host | IP | Role | JDK version | Hadoop version | ZooKeeper version | Spark version | OS |
---|---|---|---|---|---|---|---|
spark01 | 192.168.200.43 | Master node | 1.8 | 3.2.0 | 3.7.0 | 3.2.0 | Ubuntu 22.04 |
spark02 | 192.168.200.44 | Slave 1 node | 1.8 | 3.2.0 | 3.7.0 | 3.2.0 | Ubuntu 22.04 |
spark03 | 192.168.200.45 | Slave 2 node | 1.8 | 3.2.0 | 3.7.0 | 3.2.0 | Ubuntu 22.04 |
spark04 | 192.168.200.46 | Slave 3 node | 1.8 | 3.2.0 | / | 3.2.0 | Ubuntu 22.04 |
# Add hostname mappings
root@spark01:~# vim /etc/hosts
root@spark01:~# tail -4 /etc/hosts
192.168.200.43 spark01
192.168.200.44 spark02
192.168.200.45 spark03
192.168.200.46 spark04
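The same mappings need to exist on every node. A quick reachability check (a simple sketch; it assumes the same four entries have been added to /etc/hosts on each machine):
root@spark01:~# for h in spark01 spark02 spark03 spark04; do ping -c 1 $h >/dev/null && echo "$h reachable"; done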
# Create the hadoop user
root@spark01:~# adduser hadoop --home /home/hadoop
# Give the new hadoop user administrator (sudo) privileges
root@spark01:~# adduser hadoop sudo
# Format the data disks
root@spark01:~# mkfs.ext4 /dev/sdb
root@spark01:~# mkfs.ext4 /dev/sdc
# Create the mount points
root@spark01:~# mkdir /data1
root@spark01:~# mkdir /data2
# Mount the disks
root@spark01:~# mount /dev/sdb /data1/
root@spark01:~# mount /dev/sdc /data2/
# Change the owner and group of the mount points
root@spark01:~# chown hadoop:hadoop /data1
root@spark01:~# chown hadoop:hadoop /data2
# Mount automatically at boot
root@spark01:~# vim /etc/fstab
root@spark01:~# tail -2 /etc/fstab
/dev/sdb /data1 ext4 defaults 0 0
/dev/sdc /data2 ext4 defaults 0 0
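Before relying on these entries at the next boot, it is worth confirming they parse and mount cleanly (mount -a re-reads /etc/fstab and reports any errors; df then shows the mounted filesystems):
root@spark01:~# mount -a
root@spark01:~# df -h /data1 /data2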
Both cluster and single-node modes rely on SSH logins (similar to a remote login: you log in to a Linux host and run commands on it). Ubuntu ships with the SSH client by default, but the SSH server still has to be installed:
# Install openssh-server
hadoop@spark01:~$ sudo apt-get -y install openssh-server
# Generate an SSH key pair
hadoop@spark01:~$ ssh-keygen -t rsa
# Distribute the public key to every node
hadoop@spark01:~$ ssh-copy-id 192.168.200.43
hadoop@spark01:~$ ssh-copy-id 192.168.200.44
hadoop@spark01:~$ ssh-copy-id 192.168.200.45
hadoop@spark01:~$ ssh-copy-id 192.168.200.46
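ssh-copy-id asks for the remote hadoop user's password once; afterwards, passwordless login can be verified from spark01 (a quick check, assuming the hadoop user exists on all four machines). If the sshfence fencing method configured later in hdfs-site.xml is to work from spark02 as well, the key generation and distribution is usually repeated on spark02.
hadoop@spark01:~$ for h in spark01 spark02 spark03 spark04; do ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new $h hostname; done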
Go to the Spark official website.
Take care to pick the right package: Spark is normally used together with Hadoop, so download the build that bundles the Hadoop client libraries.
The front page only lists the latest Spark release; it is usually better not to take the very latest version but to step back one or two minor versions.
Then pick the Spark package that matches your Hadoop version. Builds are offered for Hadoop 2.7 and Hadoop 3.2; here we take the higher one, the package built for Hadoop 3.2.
The Hadoop version must match the Spark build, so download Hadoop 3.2 from the Hadoop website.
ZooKeeper is used here for high availability; download a release from the ZooKeeper website.
# Extract the JDK
hadoop@spark01:~$ sudo tar xf jdk-8u162-linux-x64.tar.gz -C /usr/local/
hadoop@spark01:~$ sudo mv /usr/local/jdk1.8.0_162 /usr/local/jdk
# Add environment variables
hadoop@spark01:~$ vim ~/.bashrc
hadoop@spark01:~$ tail -4 ~/.bashrc
#jdk
export JAVA_HOME=/usr/local/jdk
export PATH=$JAVA_HOME/bin:$JAVA_HOME/jre/bin:$PATH
export CLASSPATH=.:$CLASSPATH:$JAVA_HOME/lib:$JAVA_HOME/lib/tools.jar
# Apply the environment variables immediately
hadoop@spark01:~$ source ~/.bashrc
# Verify the Java installation
hadoop@spark01:~$ java -version
java version "1.8.0_162"
Java(TM) SE Runtime Environment (build 1.8.0_162-b12)
Java HotSpot(TM) 64-Bit Server VM (build 25.162-b12, mixed mode)
# Extract ZooKeeper
hadoop@spark01:~$ sudo tar xf apache-zookeeper-3.7.0-bin.tar.gz -C /usr/local/
hadoop@spark01:~$ sudo mv /usr/local/apache-zookeeper-3.7.0-bin /usr/local/zookeeper
hadoop@spark01:~$ sudo chown -R hadoop:hadoop /usr/local/zookeeper
# Create the ZooKeeper data and log directories
hadoop@spark01:~$ mkdir -p /usr/local/zookeeper/data
hadoop@spark01:~$ mkdir -p /usr/local/zookeeper/logs
# Copy zoo_sample.cfg to zoo.cfg; zoo.cfg is the file that is actually used
hadoop@spark01:~$ cd /usr/local/zookeeper/conf/
hadoop@spark01:/usr/local/zookeeper/conf$ cat zoo_sample.cfg | egrep -v "^$|^#" >zoo.cfg
hadoop@spark01:/usr/local/zookeeper/conf$ vim zoo.cfg
hadoop@spark01:/usr/local/zookeeper/conf$ cat zoo.cfg
# Heartbeat interval between the ZooKeeper server and its clients, in milliseconds
tickTime=2000
# Leader/follower initial connection time limit (in ticks)
initLimit=10
# Leader/follower sync time limit (in ticks)
syncLimit=5
# Data directory
dataDir=/usr/local/zookeeper/data
# Log directory
dataLogDir=/usr/local/zookeeper/logs
# Enable the four-letter-word admin commands
4lw.commands.whitelist=*
# Client connection port
clientPort=2181
# Cluster members
server.1=192.168.200.43:2888:3888
server.2=192.168.200.44:2888:3888
server.3=192.168.200.45:2888:3888
# [Note!] The myid value is different on every machine: the node listed as server.2 must echo 2, and so on
hadoop@spark01:/usr/local/zookeeper/conf$ echo 1 > /usr/local/zookeeper/data/myid
hadoop@spark02:/usr/local/zookeeper/conf$ echo 2 > /usr/local/zookeeper/data/myid
hadoop@spark03:/usr/local/zookeeper/conf$ echo 3 > /usr/local/zookeeper/data/myid
hadoop@spark01:/usr/local/zookeeper/conf$ cd /usr/local/zookeeper/bin/
hadoop@spark01:/usr/local/zookeeper/bin$ vim zkEnv.sh
hadoop@spark01:/usr/local/zookeeper/bin$ sed -n "32p" zkEnv.sh
export JAVA_HOME=/usr/local/jdk
hadoop@spark01:/usr/local/zookeeper/bin$ cd /etc/init.d/
hadoop@spark01:/etc/init.d$ sudo vim zookeeper
hadoop@spark01:/etc/init.d$ cat zookeeper
#!/bin/bash
#chkconfig:2345 20 90
#description:zookeeper
#processname:zookeeper
export JAVA_HOME=/usr/local/jdk
case $1 in
start) sudo /usr/local/zookeeper/bin/zkServer.sh start;;
stop) sudo /usr/local/zookeeper/bin/zkServer.sh stop;;
status) sudo /usr/local/zookeeper/bin/zkServer.sh status;;
restart) sudo /usr/local/zookeeper/bin/zkServer.sh restart;;
*) echo "require start|stop|status|restart" ;;
esac
# Make the script executable
hadoop@spark01:/etc/init.d$ sudo chmod +x zookeeper
# Start ZooKeeper
hadoop@spark01:/etc/init.d$ service zookeeper start
# Check the listening port
hadoop@spark01:~$ netstat -anp | grep 2181
tcp6 0 0 :::2181 :::* LISTEN
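Once all three nodes have been configured and started the same way, the ensemble state can be checked: one node should report leader and the other two follower. Because zoo.cfg whitelists the four-letter-word commands, the srvr command also works (this sketch assumes nc from the netcat package is available):
hadoop@spark01:~$ /usr/local/zookeeper/bin/zkServer.sh status
hadoop@spark01:~$ echo srvr | nc localhost 2181 | grep Mode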
# Install Hadoop
hadoop@spark01:~$ sudo tar xf hadoop-3.2.0.tar.gz -C /usr/local/
hadoop@spark01:~$ sudo mv /usr/local/hadoop-3.2.0 /usr/local/hadoop
hadoop@spark01:~$ sudo chown -R hadoop:hadoop /usr/local/hadoop
hadoop@spark01:~$ sudo chmod -R g+w /usr/local/hadoop/
# Verify that Hadoop works
hadoop@spark01:~$ /usr/local/hadoop/bin/hadoop version
Hadoop 3.2.0
Source code repository https://github.com/apache/hadoop.git -r e97acb3bd8f3befd27418996fa5d4b50bf2e17bf
Compiled by sunilg on 2019-01-08T06:08Z
Compiled with protoc 2.5.0
From source with checksum d3f0795ed0d9dc378e2c785d3668f39
This command was run using /usr/local/hadoop/share/hadoop/common/hadoop-common-3.2.0.jar
# Add environment variables
hadoop@spark01:~$ vim ~/.bashrc
hadoop@spark01:~$ tail -2 ~/.bashrc
#hadoop
export PATH=$PATH:/usr/local/hadoop/bin:/usr/local/hadoop/sbin
# Apply the environment variables immediately
hadoop@spark01:~$ source ~/.bashrc
For the cluster/distributed mode, the configuration files under /usr/local/hadoop/etc/hadoop need to be modified. Only the settings required for a normal start are covered here, in five files: workers, core-site.xml, hdfs-site.xml, mapred-site.xml and yarn-site.xml.
hadoop@spark01:~$ cd /usr/local/hadoop/etc/hadoop/
hadoop@spark01:/usr/local/hadoop/etc/hadoop$ vim workers
hadoop@spark01:/usr/local/hadoop/etc/hadoop$ cat workers
spark01
spark02
spark03
spark04
Edit core-site.xml so that it contains the following:
hadoop@spark01:/usr/local/hadoop/etc/hadoop$ nano core-site.xml
hadoop@spark01:/usr/local/hadoop/etc/hadoop$ cat core-site.xml
......
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://ns1</value>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>/usr/local/hadoop/tmp</value>
</property>
<property>
<name>hadoop.http.staticuser.user</name>
<value>hadoop</value>
</property>
<property>
<name>ha.zookeeper.quorum</name>
<value>spark01:2181,spark02:2181,spark03:2181</value>
</property>
<property>
<name>hadoop.proxyuser.hadoop.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.hadoop.groups</name>
<value>*</value>
</property>
<!-- Default umask applied to newly created files and directories -->
<property>
<name>fs.permissions.umask-mode</name>
<value>002</value>
</property>
</configuration>
hadoop@spark01:/usr/local/hadoop/etc/hadoop$ nano hdfs-site.xml
hadoop@spark01:/usr/local/hadoop/etc/hadoop$ cat hdfs-site.xml
......
<configuration>
<property>
<name>dfs.ha.automatic-failover.enabled.ns</name>
<value>true</value>
</property>
<property>
<name>dfs.replication</name>
<value>3</value>
</property>
<property>
<name>dfs.permissions.enabled</name>
<value>false</value>
</property>
<property>
<name>dfs.nameservices</name>
<value>ns1</value>
</property>
<property>
<name>dfs.blocksize</name>
<value>134217728</value>
</property>
<property>
<name>dfs.ha.namenodes.ns1</name>
<value>nn1,nn2</value>
</property>
<!-- RPC address of nn1 -->
<property>
<name>dfs.namenode.rpc-address.ns1.nn1</name>
<value>spark01:9000</value>
</property>
<!-- HTTP address of nn1 (web access) -->
<property>
<name>dfs.namenode.http-address.ns1.nn1</name>
<value>spark01:50070</value>
</property>
<!-- RPC address of nn2 -->
<property>
<name>dfs.namenode.rpc-address.ns1.nn2</name>
<value>spark02:9000</value>
</property>
<!-- HTTP address of nn2 (web access) -->
<property>
<name>dfs.namenode.http-address.ns1.nn2</name>
<value>spark02:50070</value>
</property>
<!-- Where the NameNode edit log is stored on the JournalNodes (usually co-located with ZooKeeper) -->
<property>
<name>dfs.namenode.shared.edits.dir</name>
<value>qjournal://spark01:8485;spark02:8485/ns1</value>
</property>
<!-- Where each JournalNode keeps its data on local disk -->
<property>
<name>dfs.journalnode.edits.dir</name>
<value>/usr/local/hadoop/journaldata</value>
</property>
<!-- Enable automatic NameNode failover -->
<property>
<name>dfs.ha.automatic-failover.enabled.ns1</name>
<value>true</value>
</property>
<!-- Java class HDFS clients use, via a proxy, to determine which NameNode is currently active -->
<property>
<name>dfs.client.failover.proxy.provider.ns1</name>
<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>
<!-- Fencing methods used during failover; several are available (see the official HA documentation). Here sshfence (log in to the other node and kill the process) is used, with shell(/bin/true) as a fallback -->
<property>
<name>dfs.ha.fencing.methods</name>
<value>
sshfence
shell(/bin/true)
</value>
</property>
<!-- SSH private key used by the sshfence method (requires passwordless SSH) -->
<property>
<name>dfs.ha.fencing.ssh.private-key-files</name>
<value>/home/hadoop/.ssh/id_rsa</value>
</property>
<!-- Timeout for the sshfence method; like the key above, only needed when sshfence is used -->
<property>
<name>dfs.ha.fencing.ssh.connect-timeout</name>
<value>30000</value>
</property>
<!-- Enable automatic failover; can be omitted if automatic failover is not wanted -->
<property>
<name>dfs.ha.automatic-failover.enabled</name>
<value>true</value>
</property>
<!-- Directories for the HDFS NameNode metadata -->
<property>
<name>dfs.name.dir</name>
<value>file:/data1/name,/data2/name</value>
</property>
<!-- Directories for the HDFS DataNode block data -->
<property>
<name>dfs.data.dir</name>
<value>file:/data1/data,/data2/data</value>
</property>
<property>
<name>dfs.webhdfs.enabled</name>
<value>true</value>
</property>
<!-- Non-DFS space reserved per volume, in bytes -->
<property>
<name>dfs.datanode.du.reserved</name>
<value>1073741824</value>
</property>
<!-- Volume-choosing policy that places replicas according to the available space on each disk -->
<property>
<name>dfs.datanode.fsdataset.volume.choosing.policy</name>
<value>org.apache.hadoop.hdfs.server.datanode.fsdataset.AvailableSpaceVolumeChoosingPolicy</value>
</property>
<!-- Whether HDFS permission checking is enabled -->
<property>
<name>dfs.permissions.enabled</name>
<value>true</value>
</property>
<!-- Whether HDFS ACLs are enabled (default false) -->
<property>
<name>dfs.namenode.acls.enabled</name>
<value>true</value>
</property>
</configuration>
Configure mapred-site.xml as follows:
hadoop@spark01:/usr/local/hadoop/etc/hadoop$ nano mapred-site.xml
hadoop@spark01:/usr/local/hadoop/etc/hadoop$ cat mapred-site.xml
......
<configuration>
<!-- Run MapReduce (MR) jobs on YARN -->
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<property>
<name>yarn.application.classpath</name>
<value>/usr/local/hadoop/etc/hadoop/mapreduce/*,/usr/local/hadoop/etc/hadoop/mapreduce/lib/*</value>
</property>
</configuration>
Configure yarn-site.xml as follows:
hadoop@spark01:/usr/local/hadoop/etc/hadoop$ nano yarn-site.xml
hadoop@spark01:/usr/local/hadoop/etc/hadoop$ cat yarn-site.xml
......
<configuration>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<!-- Site specific YARN configuration properties -->
<!-- Enable ResourceManager HA -->
<property>
<name>yarn.resourcemanager.ha.enabled</name>
<value>true</value>
</property>
<!-- Cluster id for the ResourceManager pair -->
<property>
<name>yarn.resourcemanager.cluster-id</name>
<value>yrc</value>
</property>
<!-- Two ResourceManagers are used; list their ids -->
<property>
<name>yarn.resourcemanager.ha.rm-ids</name>
<value>rm1,rm2</value>
</property>
<!-- Host running rm1 -->
<property>
<name>yarn.resourcemanager.hostname.rm1</name>
<value>spark01</value>
</property>
<!-- Host running rm2 -->
<property>
<name>yarn.resourcemanager.hostname.rm2</name>
<value>spark02</value>
</property>
<property>
<name>yarn.application.classpath</name>
<value>/usr/local/hadoop/etc/hadoop:/usr/local/hadoop/share/hadoop/common/lib/*:/usr/local/hadoop/share/hadoop/common/*:/usr/local/hadoop/share/hadoop/hdfs:/usr/local/hadoop/share/hadoop/hdfs/lib/*:/usr/local/hadoop/share/hadoop/hdfs/*:/usr/local/hadoop/share/hadoop/mapreduce/lib/*:/usr/local/hadoop/share/hadoop/mapreduce/*:/usr/local/hadoop/share/hadoop/yarn:/usr/local/hadoop/share/hadoop/yarn/lib/*:/usr/local/hadoop/share/hadoop/yarn/*</value>
</property>
<!-- ZooKeeper quorum address -->
<property>
<name>yarn.resourcemanager.zk-address</name>
<value>spark01:2181,spark02:2181,spark03:2181</value>
</property>
<!-- Enable automatic recovery so running applications survive an RM failure (default false) -->
<property>
<name>yarn.resourcemanager.recovery.enabled</name>
<value>true</value>
</property>
<!-- Auxiliary service run by the NodeManager (mapreduce_shuffle) -->
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.resourcemanager.address.rm1</name>
<value>spark01:8032</value>
</property>
<property>
<name>yarn.resourcemanager.address.rm2</name>
<value>spark02:8032</value>
</property>
<property>
<name>yarn.resourcemanager.scheduler.address.rm2</name>
<value>spark02:8030</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.address.rm2</name>
<value>spark02:8088</value>
</property>
<property>
<name>yarn.resourcemanager.resource-tracker.address.rm2</name>
<value>spark02:8031</value>
</property>
<property>
<name>yarn.resourcemanager.admin.address.rm2</name>
<value>spark02:8033</value>
</property>
<property>
<name>yarn.resourcemanager.ha.admin.address.rm2</name>
<value>spark02:23142</value>
</property>
<property>
<name>yarn.resourcemanager.scheduler.address.rm1</name>
<value>spark01:8030</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.address.rm1</name>
<value>spark01:8088</value>
</property>
<property>
<name>yarn.resourcemanager.resource-tracker.address.rm1</name>
<value>spark01:8031</value>
</property>
<property>
<name>yarn.resourcemanager.admin.address.rm1</name>
<value>spark01:8033</value>
</property>
<property>
<name>yarn.resourcemanager.ha.admin.address.rm1</name>
<value>spark01:23142</value>
</property>
<!-- Store ResourceManager state in the ZooKeeper cluster (the default is the FileSystem store) -->
<property>
<name>yarn.resourcemanager.store.class</name>
<value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>
</property>
<property>
<name>yarn.nodemanager.vmem-check-enabled</name>
<value>false</value>
<description>
</description>
</property>
<!-- Maximum disk utilization percentage after which a disk is marked as bad -->
<property>
<name>yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage</name>
<value>95.0</value>
</property>
<!-- Memory available to containers on each node, in MB (default 8192) -->
<property>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>3072</value>
</property>
<!-- Minimum memory a single container may request (default 1024 MB) -->
<property>
<name>yarn.scheduler.minimum-allocation-mb</name>
<value>1024</value>
</property>
<!-- Maximum memory a single container may request (default 8192 MB) -->
<property>
<name>yarn.scheduler.maximum-allocation-mb</name>
<value>3072</value>
</property>
<!-- Number of CPU vcores that can be allocated to containers -->
<property>
<name>yarn.nodemanager.resource.cpu-vcores</name>
<value>4</value>
</property>
<!-- Minimum vcores a single container may request (default 1) -->
<property>
<name>yarn.scheduler.minimum-allocation-vcores</name>
<value>1</value>
</property>
<!-- Maximum vcores a single container may request (default 4) -->
<property>
<name>yarn.scheduler.maximum-allocation-vcores</name>
<value>4</value>
</property>
</configuration>
hadoop@spark01:/usr/local/hadoop/etc/hadoop$ vim hadoop-env.sh
hadoop@spark01:/usr/local/hadoop/etc/hadoop$ sed -n "54p" hadoop-env.sh
export JAVA_HOME=/usr/local/jdk
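# Start the JournalNodes on every node first: the shared-edits quorum must be running before the NameNode is formatted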
hadoop@spark01:~$ hadoop-daemon.sh start journalnode
hadoop@spark02:~$ hadoop-daemon.sh start journalnode
hadoop@spark03:~$ hadoop-daemon.sh start journalnode
hadoop@spark04:~$ hadoop-daemon.sh start journalnode
# Format the NameNode
hadoop@spark01:~$ hdfs namenode -format
# Copy the formatted Hadoop directory to spark02 (the other NameNode)
hadoop@spark01:~$ scp -r /usr/local/hadoop hadoop@spark02:/usr/local/
hadoop@spark01:~$ hdfs zkfc -formatZK
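hdfs zkfc -formatZK creates the HA state znode in ZooKeeper. If it succeeded, the znode should be visible through the ZooKeeper CLI and list the ns1 nameservice (a quick check using the zkCli.sh shipped with ZooKeeper):
hadoop@spark01:~$ /usr/local/zookeeper/bin/zkCli.sh -server spark01:2181 ls /hadoop-ha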
Startup is done on the Master node; run the following commands:
# Add the following lines to start-dfs.sh and stop-dfs.sh
hadoop@spark01:~$ cd /usr/local/hadoop/sbin/
hadoop@spark01:/usr/local/hadoop/sbin$ vim start-dfs.sh
hadoop@spark01:/usr/local/hadoop/sbin$ tail -6 start-dfs.sh
HDFS_NAMENODE_USER=hadoop
HDFS_DATANODE_USER=hadoop
HDFS_DATANODE_SECURE_USER=hadoop
HDFS_SECONDARYNAMENODE_USER=hadoop
HDFS_JOURNALNODE_USER=hadoop
HDFS_ZKFC_USER=hadoop
hadoop@spark01:/usr/local/hadoop/sbin$ vim stop-dfs.sh
hadoop@spark01:/usr/local/hadoop/sbin$ tail -6 stop-dfs.sh
HDFS_NAMENODE_USER=hadoop
HDFS_DATANODE_USER=hadoop
HDFS_DATANODE_SECURE_USER=hadoop
HDFS_SECONDARYNAMENODE_USER=hadoop
HDFS_JOURNALNODE_USER=hadoop
HDFS_ZKFC_USER=hadoop
# Add the following lines to start-yarn.sh and stop-yarn.sh
hadoop@spark01:/usr/local/hadoop/sbin$ vim start-yarn.sh
hadoop@spark01:/usr/local/hadoop/sbin$ tail -3 start-yarn.sh
YARN_RESOURCEMANAGER_USER=hadoop
HADOOP_SECURE_DN_USER=hadoop
YARN_NODEMANAGER_USER=hadoop
hadoop@spark01:/usr/local/hadoop/sbin$ vim stop-yarn.sh
hadoop@spark01:/usr/local/hadoop/sbin$ tail -3 stop-yarn.sh
YARN_RESOURCEMANAGER_USER=hadoop
HADOOP_SECURE_DN_USER=hadoop
YARN_NODEMANAGER_USER=hadoop
# Start Hadoop
hadoop@spark01:/usr/local/hadoop/sbin$ start-all.sh
# Start the JobHistory server so finished jobs can be viewed on the web
hadoop@spark01:/usr/local/hadoop/sbin$ mr-jobhistory-daemon.sh start historyserver
#spark01
hadoop@spark01:/usr/local/hadoop/sbin$ jps
2832 DataNode
2705 NameNode
4118 Jps
4055 JobHistoryServer
3543 ResourceManager
2361 JournalNode
3675 NodeManager
3211 DFSZKFailoverController
#spark02
hadoop@spark02:/usr/local/hadoop/sbin$ jps
3024 NodeManager
3176 Jps
2506 NameNode
2586 DataNode
2826 DFSZKFailoverController
2330 JournalNode
2940 ResourceManager
#spark03
hadoop@spark03:/usr/local/hadoop/sbin$ jps
2325 JournalNode
2747 Jps
2636 NodeManager
2447 DataNode
#spark04
hadoop@spark04:/usr/local/hadoop/sbin$ jps
2772 JournalNode
3146 Jps
3037 NodeManager
2895 DataNode
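Because both NameNode HA and ResourceManager HA are enabled, it is also useful to check which instance is currently active: one of nn1/nn2 should report active and the other standby, and the same for rm1/rm2.
hadoop@spark01:~$ hdfs haadmin -getServiceState nn1
hadoop@spark01:~$ hdfs haadmin -getServiceState nn2
hadoop@spark01:~$ yarn rmadmin -getServiceState rm1
hadoop@spark01:~$ yarn rmadmin -getServiceState rm2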
The key thing to check is that Live datanodes is not 0:
hadoop@spark01:/usr/local/hadoop/sbin$ hdfs dfsadmin -report
Configured Capacity: 207955689472 (193.67 GB)
Present Capacity: 148957691904 (138.73 GB)
DFS Remaining: 148957593600 (138.73 GB)
DFS Used: 98304 (96 KB)
DFS Used%: 0.00%
Replicated Blocks:
Under replicated blocks: 0
Blocks with corrupt replicas: 0
Missing blocks: 0
Missing blocks (with replication factor 1): 0
Low redundancy blocks with highest priority to recover: 0
Pending deletion blocks: 0
Erasure Coded Block Groups:
Low redundancy block groups: 0
Block groups with corrupt internal blocks: 0
Missing block groups: 0
Low redundancy blocks with highest priority to recover: 0
Pending deletion blocks: 0
-------------------------------------------------
Live datanodes (4):
Name: 192.168.200.46:9866 (spark01)
Hostname: spark01
Decommission Status : Normal
Configured Capacity: 51988922368 (48.42 GB)
DFS Used: 24576 (24 KB)
Non DFS Used: 12092940288 (11.26 GB)
DFS Remaining: 37221879808 (34.67 GB)
DFS Used%: 0.00%
DFS Remaining%: 71.60%
Configured Cache Capacity: 0 (0 B)
Cache Used: 0 (0 B)
Cache Remaining: 0 (0 B)
Cache Used%: 100.00%
Cache Remaining%: 0.00%
Xceivers: 1
Last contact: Sun Jun 26 22:58:08 CST 2022
Last Block Report: Sun Jun 26 22:54:49 CST 2022
Num of Blocks: 0
Name: 192.168.200.47:9866 (spark02)
Hostname: spark02
Decommission Status : Normal
Configured Capacity: 51988922368 (48.42 GB)
DFS Used: 24576 (24 KB)
Non DFS Used: 12083855360 (11.25 GB)
DFS Remaining: 37230964736 (34.67 GB)
DFS Used%: 0.00%
DFS Remaining%: 71.61%
Configured Cache Capacity: 0 (0 B)
Cache Used: 0 (0 B)
Cache Remaining: 0 (0 B)
Cache Used%: 100.00%
Cache Remaining%: 0.00%
Xceivers: 1
Last contact: Sun Jun 26 22:58:07 CST 2022
Last Block Report: Sun Jun 26 22:54:51 CST 2022
Num of Blocks: 0
Name: 192.168.200.48:9866 (spark03)
Hostname: spark03
Decommission Status : Normal
Configured Capacity: 51988922368 (48.42 GB)
DFS Used: 24576 (24 KB)
Non DFS Used: 12107624448 (11.28 GB)
DFS Remaining: 37207195648 (34.65 GB)
DFS Used%: 0.00%
DFS Remaining%: 71.57%
Configured Cache Capacity: 0 (0 B)
Cache Used: 0 (0 B)
Cache Remaining: 0 (0 B)
Cache Used%: 100.00%
Cache Remaining%: 0.00%
Xceivers: 1
Last contact: Sun Jun 26 22:58:07 CST 2022
Last Block Report: Sun Jun 26 22:54:43 CST 2022
Num of Blocks: 0
Name: 192.168.200.49:9866 (spark04)
Hostname: spark04
Decommission Status : Normal
Configured Capacity: 51988922368 (48.42 GB)
DFS Used: 24576 (24 KB)
Non DFS Used: 12017266688 (11.19 GB)
DFS Remaining: 37297553408 (34.74 GB)
DFS Used%: 0.00%
DFS Remaining%: 71.74%
Configured Cache Capacity: 0 (0 B)
Cache Used: 0 (0 B)
Cache Remaining: 0 (0 B)
Cache Used%: 100.00%
Cache Remaining%: 0.00%
Xceivers: 1
Last contact: Sun Jun 26 22:58:07 CST 2022
Last Block Report: Sun Jun 26 22:54:43 CST 2022
Num of Blocks: 0
You can also open http://spark01:50070/ in a browser to check the status of the NameNodes and DataNodes through the web UI. If something is not working, the startup logs are the place to look for the cause.
With that, the Hadoop cluster setup is complete.
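As an optional end-to-end smoke test (a sketch, using the examples jar bundled with the Hadoop 3.2.0 distribution), a small MapReduce job exercises HDFS and YARN together:
hadoop@spark01:~$ hadoop jar /usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.2.0.jar pi 2 10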
hadoop@spark01:~$ sudo tar xf spark-3.2.0-bin-hadoop3.2.tgz -C /usr/local/
hadoop@spark01:~$ sudo mv /usr/local/spark-3.2.0-bin-hadoop3.2 /usr/local/spark
hadoop@spark01:~$ sudo chown -R hadoop:hadoop /usr/local/spark/
hadoop@spark01:~$ cd /usr/local/spark/conf/
hadoop@spark01:/usr/local/spark/conf$ cp spark-env.sh.template spark-env.sh
hadoop@spark01:/usr/local/spark/conf$ vim spark-env.sh
hadoop@spark01:/usr/local/spark/conf$ tail -22 spark-env.sh
# jdk
export JAVA_HOME=/usr/local/jdk
# Hadoop directory
export HADOOP_HOME=/usr/local/hadoop
# Hadoop configuration directory
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
# YARN configuration directory
export YARN_CONF_DIR=$HADOOP_HOME/etc/hadoop
# Spark directory
export SPARK_HOME=/usr/local/spark
# Spark executables on PATH
export PATH=$SPARK_HOME/bin:$PATH
# Master node
export SPARK_MASTER_HOST=spark01
# Job submission port
export SPARK_MASTER_PORT=7077
# Port of the Spark master web UI
export SPARK_MASTER_WEBUI_PORT=8089
hadoop@spark01:/usr/local/spark/conf$ cp workers.template workers
hadoop@spark01:/usr/local/spark/conf$ vim workers
hadoop@spark01:/usr/local/spark/conf$ tail -4 workers
spark01
spark02
spark03
spark04
hadoop@spark01:/usr/local/spark/conf$ vim ~/.bashrc
hadoop@spark01:/usr/local/spark/conf$ tail -3 ~/.bashrc
#spark
export SPARK_HOME=/usr/local/spark
export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin
hadoop@spark01:/usr/local/spark/conf$ source ~/.bashrc
# Spark's start/stop script names clash with Hadoop's, so rename them
hadoop@spark01:/usr/local/spark/conf$ cd /usr/local/spark/sbin/
hadoop@spark01:/usr/local/spark/sbin$ mv start-all.sh start-all-spark.sh
hadoop@spark01:/usr/local/spark/sbin$ mv stop-all.sh stop-all-spark.sh
Start the Spark cluster:
hadoop@spark01:/usr/local/spark/sbin$ start-all-spark.sh
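When the script finishes, jps should show a Master process on spark01 and a Worker on every node, and the standalone web UI configured above is reachable at http://spark01:8089 (a quick check):
hadoop@spark01:/usr/local/spark/sbin$ jps | grep -E "Master|Worker"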
hadoop@spark01:~$ vim test.json
hadoop@spark01:~$ cat test.json
{"DEST_COUNTRY_NAME":"United States","ORIGIN_COUNTRY_NAME":"Romania","count":1}
hadoop@spark01:~$ spark-shell
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2022-06-27 11:39:33,573 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Spark context Web UI available at http://spark01:4040
Spark context available as 'sc' (master = local[*], app id = local-1656301174643).
Spark session available as 'spark'.
Welcome to
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/
/___/ .__/\_,_/_/ /_/\_\ version 3.2.0
/_/
Using Scala version 2.12.15 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_162)
Type in expressions to have them evaluated.
Type :help for more information.
scala>
scala> val testDF = spark.read.json("file:///home/hadoop/test.json")
testDF: org.apache.spark.sql.DataFrame = [DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]
scala> testDF.write.format("parquet").save("/spark-dir/parquet/test")
The spark-shell session looks like the above.
Open the Hadoop web UI and browse the filesystem: if the files under /spark-dir/parquet/test have been created, the Spark setup succeeded.
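The same check can be done from the command line. Because HADOOP_CONF_DIR is set in spark-env.sh, the unqualified output path above is written to HDFS, so the Parquet part files and the _SUCCESS marker should be listed by:
hadoop@spark01:~$ hdfs dfs -ls /spark-dir/parquet/test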
# Create the users
hadoop@spark01:~$ sudo adduser usera --home /home/usera
hadoop@spark01:~$ sudo adduser userb --home /home/userb
hadoop@spark01:~$ sudo adduser userc --home /home/userc
# Add them to the hadoop group
hadoop@spark01:~$ sudo gpasswd -a usera hadoop
hadoop@spark01:~$ sudo gpasswd -a userb hadoop
hadoop@spark01:~$ sudo gpasswd -a userc hadoop
# Create per-user directories under /user on HDFS
hadoop@spark01:~$ hdfs dfs -mkdir -p /user/usera
hadoop@spark01:~$ hdfs dfs -mkdir -p /user/userb
hadoop@spark01:~$ hdfs dfs -mkdir -p /user/userc
# Set ownership and permissions on the HDFS user directories
hadoop@spark01:~$ hdfs dfs -chown -R usera:usera /user/usera
hadoop@spark01:~$ hdfs dfs -chown -R userb:userb /user/userb
hadoop@spark01:~$ hdfs dfs -chown -R userc:userc /user/userc
hadoop@spark01:~$ hdfs dfs -chmod -R 770 /user/usera
hadoop@spark01:~$ hdfs dfs -chmod -R 770 /user/userb
hadoop@spark01:~$ hdfs dfs -chmod -R 770 /user/userc
# Copy the environment variable file to the new users' home directories
hadoop@spark01:~$ sudo cp /home/hadoop/.bashrc /home/usera/
hadoop@spark01:~$ sudo cp /home/hadoop/.bashrc /home/userb/
hadoop@spark01:~$ sudo cp /home/hadoop/.bashrc /home/userc/
# Apply the environment variables
hadoop@spark01:~$ su - usera
usera@spark01:~$ source .bashrc
hadoop@spark01:~$ su - userb
userb@spark01:~$ source .bashrc
hadoop@spark01:~$ su - userc
userc@spark01:~$ source .bashrc
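To confirm the new accounts can actually use their HDFS home directories, run a quick write test as one of them (a sketch, shown for usera):
usera@spark01:~$ hdfs dfs -touchz /user/usera/write-test
usera@spark01:~$ hdfs dfs -ls /user/usera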
root@spark02:~# ln -fs /lib/systemd/system/rc-local.service /etc/systemd/system/rc-local.service
# Append to the end of the file
root@spark02:~# vim /etc/systemd/system/rc-local.service
root@spark02:~# tail -3 /etc/systemd/system/rc-local.service
[Install]
WantedBy=multi-user.target
Alias=rc-local.service
# Create /etc/rc.local
root@spark02:~# touch /etc/rc.local
# Make it executable
root@spark02:~# chmod +x /etc/rc.local
# Edit /etc/rc.local on each node and add that node's startup commands
vim /etc/rc.local
##spark01
# Start ZooKeeper
sudo -u hadoop sh -c '/usr/local/zookeeper/bin/zkServer.sh start'
# Start Hadoop
sudo -u hadoop sh -c '/usr/local/hadoop/sbin/start-all.sh'
# Start the JobHistory server
sudo -u hadoop sh -c '/usr/local/hadoop/sbin/mr-jobhistory-daemon.sh start historyserver'
# Start Spark
sudo -u hadoop sh -c '/usr/local/spark/sbin/start-all-spark.sh'
##spark02
# Start ZooKeeper
sudo -u hadoop sh -c '/usr/local/zookeeper/bin/zkServer.sh start'
##spark03
# Start ZooKeeper
sudo -u hadoop sh -c '/usr/local/zookeeper/bin/zkServer.sh start'
##spark04
#(nothing to add here: ZooKeeper does not run on spark04, and the Hadoop and Spark workers are started remotely from spark01)
# Warning
WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
# Fix (no restart needed)
hadoop@spark01:~$ vim /usr/local/spark/conf/spark-env.sh
hadoop@spark01:~$ tail -1 /usr/local/spark/conf/spark-env.sh
export LD_LIBRARY_PATH=$HADOOP_HOME/lib/native
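To confirm the native library is actually present under $HADOOP_HOME/lib/native, Hadoop's own check can be run; the warning should also disappear the next time spark-shell is started:
hadoop@spark01:~$ hadoop checknative -a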
# Warning
WARN yarn.Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.
# Relevant note from the official documentation
To make Spark runtime jars accessible from YARN side, you can specify spark.yarn.archive or spark.yarn.jars. For details please refer to Spark Properties. If neither spark.yarn.archive nor spark.yarn.jars is specified, Spark will create a zip file with all jars under $SPARK_HOME/jars and upload it to the distributed cache.
# Fix (no restart needed)
# The local Spark jars need to be uploaded to HDFS
hadoop@spark01:~$ hdfs dfs -mkdir -p /system/spark-jars
hadoop@spark01:~$ hdfs dfs -put /usr/local/spark/jars/* /system/spark-jars/
hadoop@spark01:~$ hdfs dfs -chmod -R 755 /system/spark-jars/
# Edit the spark-defaults.conf configuration file
hadoop@spark01:~$ cd /usr/local/spark/conf/
hadoop@spark01:/usr/local/spark/conf$ cp spark-defaults.conf.template spark-defaults.conf
hadoop@spark01:/usr/local/spark/conf$ vim spark-defaults.conf
hadoop@spark01:/usr/local/spark/conf$ tail -1 spark-defaults.conf
spark.yarn.jars hdfs://spark01:9000/system/spark-jars/*
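To confirm both fixes, submit a job in YARN mode; the SparkPi example bundled with Spark 3.2.0 is a convenient test (a sketch assuming the default examples jar name; the "uploading libraries" warning should no longer appear):
hadoop@spark01:~$ spark-submit --master yarn --deploy-mode cluster --class org.apache.spark.examples.SparkPi /usr/local/spark/examples/jars/spark-examples_2.12-3.2.0.jar 100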