// ===== Example 1: SparkSession basics — JSON/CSV sources, SQL vs DSL, write aggregate =====
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
object Demo1Sess {
  /**
   * Demonstrates basic SparkSession usage: reading JSON and CSV sources,
   * querying through a temp-view with SQL and through the DataFrame DSL,
   * and writing an aggregated result out (default save format: parquet).
   */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .master("local")
      .appName("Demo1Sess")
      // keep shuffle partitions small for a local demo (Spark's default is 200)
      .config("spark.sql.shuffle.partitions", 3)
      .getOrCreate()

    try {
      // JSON source: schema is inferred from the documents
      val stuDF: DataFrame = spark
        .read
        .format("json")
        .load("spark/data/students.json")
      stuDF.show()

      // CSV source: headerless file, so the schema is supplied explicitly
      val stucsDF: DataFrame = spark
        .read
        .format("csv")
        .schema("id String,name String,age Int,gender String,clazz String")
        .load("scala/data/students.txt")
      stucsDF.show()

      // SQL flavour: register a temp view and query it with SQL text
      stucsDF.createOrReplaceTempView("stu")
      val ageDF: DataFrame = spark.sql("select * from stu where age=22")
      ageDF.show()

      // DSL flavour: the same kind of query without SQL text
      val dslDF: DataFrame = stucsDF.where("age=23")
        .select("name", "age", "clazz")
      dslDF.show()

      // Aggregate and persist; Overwrite replaces any previous output directory
      stucsDF.groupBy("clazz")
        .count()
        .write
        .mode(SaveMode.Overwrite)
        .save("spark/data/clazz_cnt")
    } finally {
      // Fix: the original never stopped the session, leaking the local
      // SparkContext if this JVM goes on to do other work.
      spark.stop()
    }
  }
}
// ===== Example 2: creating DataFrames — json/csv/jdbc/parquet readers, RDD <-> DataFrame =====
import Practice.Student
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SaveMode, SparkSession}
object Demo2CreateDF {
  /**
   * Demonstrates the main ways to obtain a DataFrame: json / csv / jdbc /
   * parquet readers, RDD-to-DataFrame via `toDF`, and converting back to
   * an RDD[Row].
   *
   * NOTE(review): `Practice.Student` is imported at the top of the file, but
   * the `Student` case class declared inside this object shadows it
   * (definitions take precedence over imports), so that import is dead here —
   * confirm which type is actually intended.
   */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .master("local")
      .appName("Demo2CreateDF")
      .config("spark.sql.shuffle.partitions", 3)
      .getOrCreate()

    try {
      // 1) JSON: schema inferred from the documents
      val jsonDF: DataFrame = spark.read
        .format("json")
        .load("spark/data/students.json")

      // 2) CSV: separator and schema supplied explicitly
      val csvDF: DataFrame = spark.read
        .format("csv")
        .option("sep", ",")
        .schema("id String,name String,age Int,gender String,clazz String")
        .load("scala/data/students.txt")

      // 3) JDBC: reads table metadata at load(); rows are fetched lazily
      val jdbcDF: DataFrame = spark.read
        .format("jdbc")
        .option("url", "jdbc:mysql://master:3306/student")
        .option("dbtable", "student")
        .option("user", "root")
        .option("password", "123456")
        .load()

      // 4) Parquet: self-describing columnar format (result unused — demo only)
      spark.read
        .format("parquet")
        .load("spark/data/stu_parquet")

      // 5) RDD -> DataFrame: parse each CSV line into a Student, then toDF
      val stuRDD: RDD[String] = spark.sparkContext.textFile("scala/data/students.txt")
      val stuRDD2: RDD[Student] = stuRDD.map(line => {
        val fields: Array[String] = line.split(",")
        Student(fields(0), fields(1), fields(2), fields(3), fields(4))
      })
      import spark.implicits._
      val sDF: DataFrame = stuRDD2.toDF()
      sDF.show()

      // DataFrame -> RDD[Row]: fields are fetched back out by column name.
      // println runs on the executors; with master("local") the output is
      // visible in this JVM's console.
      val rdd: RDD[Row] = sDF.rdd
      rdd.foreach(row => {
        val id: String = row.getAs[String]("id")
        val name: String = row.getAs[String]("name")
        println(s"$id,$name")
      })
    } finally {
      // Fix: the original never stopped the session.
      spark.stop()
    }
  }

  // All-String fields keep the demo simple; typed parsing is out of scope here.
  case class Student(id: String, name: String, age: String, gender: String, clazz: String)
}
// ===== Example 3: DataFrame operators — filter/select/groupBy/agg, temp-view SQL, join =====
import org.apache.spark.sql.{DataFrame, SparkSession}
object DFapi {
  /**
   * Tour of common DataFrame operators: where/filter in three equivalent
   * forms, select with a computed column, groupBy + agg, temp-view SQL,
   * and a left join.
   *
   * Note: several intermediate DataFrames below are built but never acted
   * on; Spark is lazy, so those lines only define a plan and execute nothing.
   */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .master("local")
      .appName("DFapi")
      .config("spark.sql.shuffle.partitions", 2)
      .getOrCreate()
    import spark.implicits._

    try {
      val stuDF: DataFrame = spark.read
        .format("csv")
        .option("sep", ",")
        .schema("id String,name String,age String,gender String,clazz String")
        .load("scala/data/students.txt")

      // Cached because the frame is reused by many actions below
      stuDF.cache()

      // Three equivalent filters: SQL-expression string, Column expression,
      // and a typed row predicate.
      stuDF.where("age>23")
      stuDF.where($"age" > 23) // age is a String column; Spark casts for the comparison
      // Fix: the original wrapped the comparison in `if (...) true else false`
      stuDF.filter(row => row.getAs[String]("age").toInt > 23)

      // Computed column with an alias
      stuDF.select($"id", $"name", $"age" + 100 as "newage")

      stuDF.groupBy($"clazz")
        .count().show()

      import org.apache.spark.sql.functions._
      stuDF.groupBy($"clazz", $"gender")
        .agg(count($"gender"))
        .show()

      // Distinct count of ids per class ("去重人数" = deduplicated head count)
      stuDF.groupBy($"clazz")
        .agg(countDistinct($"id") as "去重人数")
        .show()

      // The same aggregation expressed as SQL over a temp view
      stuDF.createOrReplaceTempView("stu")
      spark.sql(
        """
          |select clazz,count(distinct id)
          |from stu
          |group by clazz
        """.stripMargin
      ).show()

      val scoreDF: DataFrame = spark.read
        .format("csv")
        .schema("sid String,sub_id String,score Int")
        .load("scala/data/score.txt")
      // Left join; the key columns have different names, so no ambiguity afterwards
      stuDF.join(scoreDF, $"id" === $"sid", "left").show()

      stuDF.unpersist()
    } finally {
      // Fix: the original never stopped the session.
      spark.stop()
    }
  }
}
// ===== Example 4: telecom data — distinct-subscriber counts ranked with a window function =====
import org.apache.spark.sql.{DataFrame, SparkSession}
object DianXin {
  /**
   * Telecom demo: count distinct subscribers (mdn) per (city_id, county_id),
   * then rank rows with row_number() and keep ranks below 3.
   */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .master("local")
      .appName("DianXin")
      .config("spark.sql.shuffle.partitions", 2)
      .getOrCreate()

    try {
      val dxDF: DataFrame = spark.read
        .format("csv")
        .option("sep", ",")
        .schema("mdn String,grid_id String,city_id String,county_id String,t String,start_time String,end_time String,date String")
        .load("spark/data/dianxin_data")

      dxDF.createOrReplaceTempView("dx")

      // NOTE(review): the window partitions by county_id while the aggregate
      // groups by (city_id, county_id). If a county_id belongs to exactly one
      // city_id, every partition holds a single row, rk is always 1, and the
      // rk<3 filter keeps everything. "Top 2 counties per city" would need
      // `partition by city_id` instead — confirm intent before changing the SQL.
      spark.sql(
        """
          |select tt1.city_id,tt1.county_id,tt1.sum,tt1.rk
          |from
          |(select t1. city_id,t1.county_id,t1.sum,row_number() over (partition by county_id order by t1.sum desc) as rk
          |from
          |(select city_id,county_id,count(distinct mdn) as sum
          |from
          |dx
          |group by city_id,county_id) t1) tt1
          |where tt1.rk<3
          |
        """.stripMargin
      ).show()
    } finally {
      // Fix: the original never stopped the session. The unused method-local
      // imports (spark.implicits._, functions._) were removed as well — only
      // SQL text is used in this demo.
      spark.stop()
    }
  }
}
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】Flutter适配HarmonyOS 5知识地图,实战解析+高频避坑指南
【推荐】凌霞软件回馈社区,携手博客园推出1Panel与Halo联合会员
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 如何反向绘制出 .NET程序 异步方法调用栈
· 领域驱动设计实战:聚合根设计与领域模型实现
· 突破Excel百万数据导出瓶颈:全链路优化实战指南
· 如何把ASP.NET Core WebApi打造成Mcp Server
· Linux系列:如何用perf跟踪.NET程序的mmap泄露
· 聊聊 ruoyi-vue ,ruoyi-vue-plus ,ruoyi-vue-pro 谁才是真正的
· C#开发的Panel滚动分页控件 - 开源研究系列文章
· 如何反向绘制出 .NET程序 异步方法调用栈
· ShadowSql之开源不易
· Python 3.14 新特性盘点,更新了些什么?