[root@centos00 ~]$ cd hadoop-2.6.0-cdh5.14.2/ [root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/hadoop-daemon.sh start namenode [root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/hadoop-daemon.sh start datanode [root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/yarn-daemon.sh start resourcemanager [root@centos00 ~]$ cd /opt/cdh5.14.2/hive-1.1.0-cdh5.14.2/ [root@centos00 hive-1.1.0-cdh5.14.2]$ bin/hive --service metastore & [root@centos00 ~]$ cd /opt/cdh5.14.2/spark-2.2.1-cdh5.14.2/ [root@centos00 spark-2.2.1-cdh5.14.2]$ sbin/start-master.sh [root@centos00 spark-2.2.1-cdh5.14.2]$ sbin/start-slaves.sh scala> import org.apache.spark.sql.functions._ import org.apache.spark.sql.functions._ scala> val arr = Array(("a", "20"), ("a", "30"), ("b", "20"), ("a", "20")) arr: Array[(String, String)] = Array((a,20), (a,30), (b,20), (a,20)) scala> val df = sc.parallelize(arr).toDF("id", "age") df: org.apache.spark.sql.DataFrame = [id: string, age: string] scala> df.show(false) +---+---+ |id |age| +---+---+ |a |20 | |a |30 | |b |20 | |a |20 | +---+---+ scala> df.groupBy('id).agg(countDistinct('age) as 'distinctAge).show(false) +---+-----------+ |id |distinctAge| +---+-----------+ |b |1 | |a |2 | +---+-----------+ scala> df.groupBy("id").agg(countDistinct("age") as "distinctAge").show(false) +---+-----------+ |id |distinctAge| +---+-----------+ |b |1 | |a |2 | +---+-----------+