Command Lines and Code from Chapter 3 of the Textbook 《Spark编程基础(Python版)》 (Spark Programming Fundamentals, Python Edition) by Lin Ziyu

The code in the textbook 《Spark编程基础(Python版)》 (Spark Programming Fundamentals, Python Edition), written by Lin Ziyu, Zheng Haishan, and Lai Yongxuan (textbook official website), may be hard to read as printed in the paper edition. To help readers understand the code correctly, or copy it directly for hands-on lab exercises, all of the code that accompanies the book is provided here.

View the code for all chapters

Chapter 3  Setting Up and Using the Spark Environment

sudo tar -zxf ~/下载/spark-2.4.0-bin-without-hadoop.tgz -C /usr/local/    # ~/下载 is the Downloads directory on a Chinese-locale Linux system

cd /usr/local

sudo mv ./spark-2.4.0-bin-without-hadoop ./spark

sudo chown -R hadoop:hadoop ./spark    # hadoop is the username of the user currently logged in to the Linux system

cd /usr/local/spark

cp ./conf/spark-env.sh.template ./conf/spark-env.sh
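The export line that follows is a configuration entry, not a command to run at the prompt: in the textbook's setup it is added to the newly created conf/spark-env.sh, which can be opened with:

vim ./conf/spark-env.sh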

export SPARK_DIST_CLASSPATH=$(/usr/local/hadoop/bin/hadoop classpath)

vim ~/.bashrc

export JAVA_HOME=/usr/lib/jvm/jdk1.8.0_162

export JRE_HOME=${JAVA_HOME}/jre

export CLASSPATH=.:${JAVA_HOME}/lib:${JRE_HOME}/lib

export PATH=$PATH:${JAVA_HOME}/bin:/usr/local/hbase/bin

export HADOOP_HOME=/usr/local/hadoop

export SPARK_HOME=/usr/local/spark

export PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.7-src.zip:$PYTHONPATH

export PYSPARK_PYTHON=python3

export PATH=$HADOOP_HOME/bin:$SPARK_HOME/bin:$PATH

source ~/.bashrc

cd /usr/local/spark

./bin/run-example SparkPi

./bin/run-example SparkPi 2>&1 | grep "Pi is roughly"
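If the run succeeds, the filtered output is a single line of roughly the following form; the digits after 3.14 differ from run to run, because the example estimates pi by random sampling:

Pi is roughly 3.14...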

cd /usr/local/hadoop

./sbin/start-dfs.sh

./sbin/stop-dfs.sh

pyspark --master <master-url>    # general form; <master-url> selects where the application runs
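Common values for <master-url> are the standard Spark master URL forms (they come from Spark itself, not from the textbook):

local                  run Spark locally with a single worker thread
local[K]               run Spark locally with K worker threads
local[*]               run Spark locally with as many worker threads as there are CPU cores
spark://HOST:PORT      connect to a standalone cluster Master (default port 7077)
yarn                   connect to a YARN cluster, located through HADOOP_CONF_DIR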

cd /usr/local/spark

./bin/pyspark --master local[4]

cd /usr/local/spark

./bin/pyspark --master local[4] --jars code.jar

cd /usr/local/spark

./bin/pyspark --help

cd /usr/local/spark

./bin/pyspark

export PYSPARK_PYTHON=python3    # make pyspark use Python 3 (exported so the setting is visible to the pyspark process)

cd /usr/local/spark

./bin/pyspark

>>> 8*2+5
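Still inside the same interactive session, and before exiting, a small RDD computation can be used to confirm that Spark itself is working. This is a minimal sketch; sc is the SparkContext that the pyspark shell creates automatically:

>>> rdd = sc.parallelize([1, 2, 3, 4, 5])
>>> rdd.map(lambda x: x * x).sum()   # 1 + 4 + 9 + 16 + 25
55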

>>> exit()

WordCount.py  (saved as /usr/local/spark/mycode/python/WordCount.py)

from pyspark import SparkConf, SparkContext

# run locally, with the application name "My App"
conf = SparkConf().setMaster("local").setAppName("My App")
sc = SparkContext(conf = conf)

# read the local Spark README as an RDD of lines (2 partitions) and cache it
logFile = "file:///usr/local/spark/README.md"
logData = sc.textFile(logFile, 2).cache()

# count the lines containing the letter 'a' and the letter 'b', then print both counts
numAs = logData.filter(lambda line: 'a' in line).count()
numBs = logData.filter(lambda line: 'b' in line).count()
print('Lines with a: %s, Lines with b: %s' % (numAs, numBs))

cd /usr/local/spark/mycode/python

python3 WordCount.py
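When it runs successfully, the program prints a single line of the following form; the two counts are not reproduced here because they depend on the exact README.md shipped with the Spark release:

Lines with a: <count>, Lines with b: <count>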

cd /usr/local/spark

./bin/spark-submit --help

/usr/local/spark/bin/spark-submit /usr/local/spark/mycode/python/WordCount.py

/usr/local/spark/bin/spark-submit \

> /usr/local/spark/mycode/python/WordCount.py

cd /usr/local/spark/conf

sudo mv log4j.properties.template log4j.properties

vim log4j.properties

log4j.rootCategory=ERROR, console
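For reference, the corresponding line in log4j.properties.template reads:

log4j.rootCategory=INFO, console

Changing INFO to ERROR means that only ERROR-level messages reach the console, which suppresses the large amount of INFO output normally printed when the pyspark shell starts.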

sudo tar -zxf ~/下载/spark-2.4.0-bin-without-hadoop.tgz -C /usr/local/

cd /usr/local

sudo mv ./spark-2.4.0-bin-without-hadoop ./spark

sudo chown -R hadoop:hadoop ./spark    # hadoop is the username of the user currently logged in to the Linux system

vim ~/.bashrc

export SPARK_HOME=/usr/local/spark

export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin

source ~/.bashrc

cd /usr/local/spark/

cp ./conf/slaves.template ./conf/slaves
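The two hostnames that follow are the contents written into conf/slaves, one Worker hostname per line, replacing the template's default localhost entry; Slave01 and Slave02 are the Worker hostnames used in the textbook's example cluster. The file can be edited with:

vim ./conf/slaves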

Slave01

Slave02

cp ./conf/spark-env.sh.template ./conf/spark-env.sh
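As in the single-machine setup, the three export lines that follow are configuration entries added to the newly created conf/spark-env.sh on the Master node, not commands typed at the prompt; the file can be opened with:

vim ./conf/spark-env.sh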

export SPARK_DIST_CLASSPATH=$(/usr/local/hadoop/bin/hadoop classpath)

export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop

export SPARK_MASTER_IP=192.168.1.104

cd /usr/local/

tar -zcf ~/spark.master.tar.gz ./spark

cd ~

scp ./spark.master.tar.gz Slave01:/home/hadoop

scp ./spark.master.tar.gz Slave02:/home/hadoop

sudo rm -rf /usr/local/spark/    # on each Worker node (Slave01 and Slave02): remove any old Spark directory

sudo tar -zxf ~/spark.master.tar.gz -C /usr/local    # on each Worker node: unpack the copy sent from the Master

sudo chown -R hadoop /usr/local/spark    # on each Worker node

cd /usr/local/hadoop/

sbin/start-all.sh

cd /usr/local/spark/

sbin/start-master.sh

cd /usr/local/spark/

sbin/start-slaves.sh
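Once the Master and the Workers have been started, their status can be checked in a browser through the Master's web UI; port 8080 is Spark's default, and the hostname master matches the textbook's example cluster:

http://master:8080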

sbin/stop-master.sh

sbin/stop-slaves.sh

cd /usr/local/hadoop/

sbin/stop-all.sh

cd /usr/local/hadoop/

sbin/start-all.sh

cd /usr/local/spark/

sbin/start-master.sh

sbin/start-slaves.sh

cd /usr/local/spark/

bin/spark-submit \

> --master spark://master:7077 \

> /usr/local/spark/examples/src/main/python/pi.py 2>&1 | grep "Pi is roughly"

cd /usr/local/spark/

bin/pyspark --master spark://master:7077

>>> textFile = sc.textFile("hdfs://master:9000/README.md")

>>> textFile.count()

105

>>> textFile.first()

'# Apache Spark'
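As a small follow-up in the same session, reusing the textFile RDD defined above, the lines mentioning Spark can be counted with a filter; the resulting number depends on the README version, so it is not shown here:

>>> textFile.filter(lambda line: "Spark" in line).count()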

cd /usr/local/spark/

bin/spark-submit \

> --master yarn \

> --deploy-mode client \

> /usr/local/spark/examples/src/main/python/pi.py

cd /usr/local/spark/

bin/pyspark --master yarn

>>> textFile = sc.textFile("hdfs://master:9000/README.md")

>>> textFile.count()

105

>>> textFile.first()

'# Apache Spark'