pyspark


# Example

# Minimal PySpark example: build an RDD from a local list, keep only the
# strings containing "he", and print the surviving elements and their count.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("boye").getOrCreate()
# spark = SparkSession.builder.appName("test").master("local[2]").getOrCreate()  # run locally with 2 threads
sc = spark.sparkContext  # one SparkContext per application; no need to re-assign it

datas = ["hi I love you", "hello", "ni hao"]
# "he" in x is the idiomatic substring test (was x.__contains__("he")).
rdd = sc.parallelize(datas).filter(lambda x: "he" in x)
print(rdd.collect())  # expected: ['hello']
print(rdd.count())    # expected: 1

 

# Environment variable configuration for running PySpark from the shell.
export SPARK_HOME=spark-2.4.3-bin-hadoop2.7
# Fixed: the original line was missing the '=' (export PATH$SPARK_HOME/...).
export PATH=$SPARK_HOME/bin:$PATH
# Fixed: removed the stray space in "src. zip" and rejoined "$PYTHON PATH".
# NOTE(review): Spark 2.4.3 normally bundles py4j-0.10.7 — confirm the zip
# name against the actual contents of $SPARK_HOME/python/lib.
export PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.4-src.zip:$PYTHONPATH
export PYSPARK_PYTHON=/opt/local/python/bin/python3
export PYSPARK_DRIVER_PYTHON=/opt/local/python/bin/python3

运行: spark-submit --master local[*] spark_001.py

 

上一篇:python – 使用UDF忽略条件


下一篇:kafka+pyspark