pyspark使用jupyter-lab调试

1:后台启动jupyter

nohup jupyter lab --port 9000 --ip 0.0.0.0 &

2:配置 Spark 路径并创建 SparkSession

import os
import sys

# Python 2 only: make utf8 the implicit str/unicode codec. ``reload(sys)``
# restores ``sys.setdefaultencoding``, which Python 2 removes at start-up.
# Python 3 has neither the builtin ``reload`` nor this function, so skip it.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding("utf8")

# Root of the Spark installation -- adjust this path to your environment.
# Alternative: spark_name = os.environ.get("SPARK_HOME", None)
spark_name = "/home/work/local/spark/"

# Put the pyspark and py4j sources under spark_name at the front of sys.path
# so the imports below resolve to them.
# NOTE(review): the py4j zip name presumably has to match the file actually
# present under python/lib of your Spark release -- confirm.
sys.path.insert(0, os.path.join(spark_name, "python"))
sys.path.insert(0, os.path.join(spark_name, "python/lib/py4j-0.10.4-src.zip"))

# These imports must stay below the sys.path changes above.
from pyspark.sql import session
import pyspark.sql.functions as F
from pyspark.sql.types import *

user_name = "xxx"
app_name = "spark_debug"

# Build (or reuse) the session with Hive support enabled.
spark = (
    session.SparkSession.builder
    .appName("{user_name}-jupyter-{app_name}".format(user_name=user_name, app_name=app_name))
    .config("spark.driver.maxResultSize", "6g")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "12g")
    .config("spark.executor.instances", "32")
    .config("spark.dynamicAllocation.maxExecutors", "32")
    .config("spark.yarn.dist.archives", "hdfs://ns-fed/user/strategy/yudonghai/python.tgz#python")
    .config("spark.pyspark.python", "python/bin/python2.7")
    .config("spark.sql.shuffle.partitions", "500")
    .enableHiveSupport()
    .getOrCreate()
)

# Application id of the running job, read through the public sparkContext
# accessor; keep it at hand so the job can be killed manually afterwards.
application_id = spark.sparkContext.applicationId

注意:调试结束后,请手动 kill 该任务(可通过上面的 application_id 定位)以释放集群资源。

pyspark使用jupyter-lab调试

上一篇:leetcode-华为专题-103. 二叉树的锯齿形层序遍历


下一篇:C#泛型List的用法