score:1

Accepted answer

I think a combination of Tokenizer and explode might work. The solution is given below:

// Read the tab-delimited file into a DataFrame using the predefined schema
// (documentid: bigint, description: string).
scala> val data = spark.read.format("csv").option("delimiter", "\t").schema(schema).load("plot_summaries.txt")
data: org.apache.spark.sql.DataFrame = [documentid: bigint, description: string]

scala> data.show(1)
+----------+--------------------+
|documentid|         description|
+----------+--------------------+
|  23890098|shlykov, a hard-w...|
+----------+--------------------+
only showing top 1 row


scala> import org.apache.spark.sql.functions.explode
import org.apache.spark.sql.functions.explode

scala> import org.apache.spark.ml.feature.Tokenizer
import org.apache.spark.ml.feature.Tokenizer

// Tokenizer lower-cases the text and splits it on whitespace into an
// array<string> column named "words".
scala> val tokenizer = new Tokenizer().setInputCol("description").setOutputCol("words")
tokenizer: org.apache.spark.ml.feature.Tokenizer = tok_80d1c6e72cbc

scala> val wordsData = tokenizer.transform(data)
wordsData: org.apache.spark.sql.DataFrame = [documentid: bigint, description: string ... 1 more field]

scala> wordsData.show(1)
+----------+--------------------+--------------------+
|documentid|         description|               words|
+----------+--------------------+--------------------+
|  23890098|shlykov, a hard-w...|[shlykov,, a, har...|
+----------+--------------------+--------------------+
only showing top 1 row


// The original description is no longer needed once we have the token array.
scala> val newWordsData = wordsData.drop("description")
newWordsData: org.apache.spark.sql.DataFrame = [documentid: bigint, words: array<string>]

scala> newWordsData.show(1)
+----------+--------------------+
|documentid|               words|
+----------+--------------------+
|  23890098|[shlykov,, a, har...|
+----------+--------------------+
only showing top 1 row


// explode() turns each element of the "words" array into its own row,
// keeping the documentid alongside each token.
scala> val flattened = newWordsData.withColumn("token", explode($"words"))
flattened: org.apache.spark.sql.DataFrame = [documentid: bigint, words: array<string> ... 1 more field]

scala> flattened.show
+----------+--------------------+-------------+
|documentid|               words|        token|
+----------+--------------------+-------------+
|  23890098|[shlykov,, a, har...|     shlykov,|
|  23890098|[shlykov,, a, har...|            a|
|  23890098|[shlykov,, a, har...| hard-working|
|  23890098|[shlykov,, a, har...|         taxi|
|  23890098|[shlykov,, a, har...|       driver|
|  23890098|[shlykov,, a, har...|          and|
|  23890098|[shlykov,, a, har...|      lyosha,|
|  23890098|[shlykov,, a, har...|            a|
|  23890098|[shlykov,, a, har...| saxophonist,|
|  23890098|[shlykov,, a, har...|      develop|
|  23890098|[shlykov,, a, har...|      bizarre|
|  23890098|[shlykov,, a, har...|    love-hate|
|  23890098|[shlykov,, a, har...|relationship,|
|  23890098|[shlykov,, a, har...|          and|
|  23890098|[shlykov,, a, har...|      despite|
|  23890098|[shlykov,, a, har...|        their|
|  23890098|[shlykov,, a, har...|  prejudices,|
|  23890098|[shlykov,, a, har...|      realize|
|  23890098|[shlykov,, a, har...|         they|
+----------+--------------------+-------------+
only showing top 20 rows

Let me know if this helps!

score:1

First split the description column to get an array, and then explode the array to get the individual words as rows associated with the documentid.

example:

import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

// Sample data: a single tab-delimited record (documentid \t description).
val csvData: Dataset[String] = spark.sparkContext.parallelize(
  """
|23890098	shlykov, a hard-working taxi driver and lyosha, a saxophonist, develop a bizarre love-hate relationship, and despite their prejudices, realize they aren't so different after all.
  """.stripMargin.lines.toList).toDS()

// Schema for the two columns; nullable = true for both.
val schema = new StructType().add("documentid", LongType, true).add("description", StringType, true)

// Read the dataset from the in-memory Dataset[String].
val data = spark.read.option("delimiter", "\t").schema(schema).csv(csvData)

data.withColumn("description", split(col("description"), " ")). // split description to get array of words
    withColumn("token", explode(col("description"))). // explode the array to get each word as an individual row with its documentid
    show(false)

//+----------+--------------------+-------------+
//|documentid|         description|        token|
//+----------+--------------------+-------------+
//|  23890098|[shlykov,, a, har...|     shlykov,|
//|  23890098|[shlykov,, a, har...|            a|
//|  23890098|[shlykov,, a, har...| hard-working|
//|  23890098|[shlykov,, a, har...|         taxi|
//|  23890098|[shlykov,, a, har...|       driver|
//|  23890098|[shlykov,, a, har...|          and|
//|  23890098|[shlykov,, a, har...|      lyosha,|
//|  23890098|[shlykov,, a, har...|            a|
//|  23890098|[shlykov,, a, har...| saxophonist,|
//|  23890098|[shlykov,, a, har...|      develop|
//|  23890098|[shlykov,, a, har...|            a|
//|  23890098|[shlykov,, a, har...|      bizarre|
//|  23890098|[shlykov,, a, har...|    love-hate|
//|  23890098|[shlykov,, a, har...|relationship,|
//|  23890098|[shlykov,, a, har...|          and|
//|  23890098|[shlykov,, a, har...|      despite|
//|  23890098|[shlykov,, a, har...|        their|
//|  23890098|[shlykov,, a, har...|  prejudices,|
//|  23890098|[shlykov,, a, har...|      realize|
//|  23890098|[shlykov,, a, har...|         they|
//+----------+--------------------+-------------+
//only showing top 20 rows

Related Questions

More questions from the same tag