1. Install pyarrow for acceleration
For PySpark 2.3, the matching pyarrow version is 0.14.1.
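It can be installed with pip into the py37 environment used below, for example:
pip install pyarrow==0.14.1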
2. Environment variables to configure in PyCharm
HADOOP_HOME /opt/hdp/2.3.4.0-315/hadoop
SPARK_HOME /opt/hdp/2.3.4.0-315/spark2
PYTHONPATH /data/soft/anaconda3/envs/py37/bin/python
PYSPARK_PYTHON /data/soft/anaconda3/envs/py37/bin/python
PYSPARK_DRIVER_PYTHON /data/soft/anaconda3/envs/py37/bin/python
HADOOP_USER_NAME user
Replace each value with the one that matches your own environment.
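If you would rather not rely on PyCharm's run configuration, a minimal sketch of setting the same variables in code before creating the SparkSession (the paths are placeholders and must be replaced just like the values above):
import os

# Same variables as the PyCharm run configuration; values are placeholders
os.environ["HADOOP_HOME"] = "/opt/hdp/2.3.4.0-315/hadoop"
os.environ["SPARK_HOME"] = "/opt/hdp/2.3.4.0-315/spark2"
os.environ["PYSPARK_PYTHON"] = "/data/soft/anaconda3/envs/py37/bin/python"
os.environ["PYSPARK_DRIVER_PYTHON"] = "/data/soft/anaconda3/envs/py37/bin/python"
os.environ["HADOOP_USER_NAME"] = "user"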
3. Initialize the context
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession, HiveContext
sparkConf = SparkConf()
# sparkConf.set("spark.pyspark.python", "py37/bin/python")
# sparkConf.set("spark.yarn.dist.archives", "hdfs://pda/user/pyspark.zip#py37")
sparkConf.setMaster("yarn")
# sparkConf.setMaster("local")
# Set the driver process memory
sparkConf.set('spark.driver.memory', '2G')
sparkConf.set("spark.submit.deployMode", "cluster")
# Set the number of CPU cores for the driver
sparkConf.set('spark.driver.cores', '2')
# Set the total number of executor processes for the Spark job
sparkConf.set("spark.executor.instances", "2")
# Set the number of CPU cores per executor
sparkConf.set("spark.executor.cores", "2")
# Set the number of cores per task
# sparkConf.set("spark.task.cpus", "1")
# Set the memory per executor
sparkConf.set("spark.executor.memory", "2G")
# Set the Spark application name
sparkConf.set("spark.app.name", "push_send")
sparkConf.set("spark.sql.shuffle.partitions", "600")
# Limit on executor CPU cores (Kubernetes only)
# Note: make sure spark.kubernetes.executor.limit.cores >= spark.executor.cores, otherwise the executors will not start
# sparkConf.set("spark.kubernetes.executor.limit.cores", "2")
# Enable Arrow-based data transfer (pyarrow)
sparkConf.set("spark.sql.execution.arrow.enabled", "true")
spark = SparkSession.builder.config(conf=sparkConf).enableHiveSupport().getOrCreate()
sc = spark.sparkContext
hiveContext = HiveContext(sc)  # deprecated in Spark 2.x; SparkSession with enableHiveSupport() covers the same functionality
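# Quick sanity check: build a small DataFrame and collect it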
l = [('Alice', 1)]
spark.createDataFrame(l).collect()
print(spark.createDataFrame(l, ['name', 'age']).collect())
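With the Arrow option enabled, conversions between Spark and pandas DataFrames go through pyarrow. A minimal sketch of where this matters, using the standard toPandas() and pandas_udf APIs (the column names are only illustrative):
import pandas as pd
from pyspark.sql.functions import pandas_udf, PandasUDFType

# createDataFrame(pandas_df) and toPandas() both use Arrow when the option is on
df = spark.createDataFrame(pd.DataFrame({"id": range(1000), "v": range(1000)}))
pdf = df.toPandas()

# Scalar pandas UDFs also exchange data via Arrow
@pandas_udf("long", PandasUDFType.SCALAR)
def plus_one(v):
    return v + 1

df.withColumn("v_plus_one", plus_one(df["v"])).show(5)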
4. Submit the job
nohup spark-submit --master yarn --deploy-mode cluster --num-executors 2 --executor-memory 1G main.py &
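In cluster mode, the packed Python environment referenced by the commented-out settings above (spark.yarn.dist.archives / spark.pyspark.python) can also be shipped from the command line; a sketch, assuming pyspark.zip is an archive of the py37 conda environment already uploaded to HDFS:
nohup spark-submit --master yarn --deploy-mode cluster \
  --num-executors 2 --executor-memory 1G \
  --archives hdfs://pda/user/pyspark.zip#py37 \
  --conf spark.pyspark.python=./py37/bin/python \
  main.py &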