Spark shell
Start the Spark shell (it automatically creates a SparkContext, available as the variable sc):
bin/spark-shell
scala> val licLines = sc.textFile("LICENSE")
licLines: org.apache.spark.rdd.RDD[String] = LICENSE MapPartitionsRDD[1] at textFile at <console>:23
scala> val lineCnt = licLines.count
lineCnt: Long = 56
Filter for the lines that contain the string "BSD":
scala> val bsdLines = licLines.filter(line => line.contains("BSD"))
bsdLines: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[2] at filter at <console>:23
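filter, like most RDD operations, is a transformation: it only defines a new RDD, and nothing is computed until an action runs. Continuing the same session (a sketch; the actual results depend on the contents of LICENSE):

scala> bsdLines.count
scala> bsdLines.foreach(line => println(line))

count returns the number of matching lines, and foreach prints each matching line (to the shell's console when running locally).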
The DataFrame and MLlib APIs are also available from Python. The following example trains a random forest regressor; it assumes the PySpark shell (bin/pyspark), where spark is predefined, and the sample data is filled in here for illustration, since the original snippet leaves data undefined.

from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import RandomForestRegressor

# Every record contains a label and a feature vector (sample values for illustration)
data = [(1.0, Vectors.dense([0.0, 1.1, 0.1])),
        (0.0, Vectors.dense([2.0, 1.0, -1.0])),
        (1.0, Vectors.dense([2.0, 1.3, 1.0])),
        (0.0, Vectors.dense([0.0, 1.2, -0.5]))]
df = spark.createDataFrame(data, ["label", "features"])

# Split the data into train/test datasets
train_df, test_df = df.randomSplit([.80, .20], seed=42)

# Set hyperparameters for the algorithm
rf = RandomForestRegressor(numTrees=100)

# Fit the model to the training data
model = rf.fit(train_df)

# Generate predictions on the test dataset
model.transform(test_df).show()
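transform appends a prediction column to the test DataFrame alongside label and features, so show displays the model's prediction next to the true label for each held-out row.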