One way is to cast all columns to string. Note that I'm changing r.get(idx) to r.getString(idx) in your code. The below works:

scala> val df = Seq(("servicecent4","ap-1-ioo-ppp","241.206.155.172","06-12-18:17:42:34",162,53,1544098354885L)).toDF("col1","col2","col3","eventtime","col4","col5","col6")
df: org.apache.spark.sql.DataFrame = [col1: string, col2: string ... 5 more fields]

scala> df.show(1,false)
+------------+------------+---------------+-----------------+----+----+-------------+
|col1        |col2        |col3           |eventtime        |col4|col5|col6         |
+------------+------------+---------------+-----------------+----+----+-------------+
|servicecent4|ap-1-ioo-ppp|241.206.155.172|06-12-18:17:42:34|162 |53  |1544098354885|
+------------+------------+---------------+-----------------+----+----+-------------+
only showing top 1 row

scala> df.printSchema
root
 |-- col1: string (nullable = true)
 |-- col2: string (nullable = true)
 |-- col3: string (nullable = true)
 |-- eventtime: string (nullable = true)
 |-- col4: integer (nullable = false)
 |-- col5: integer (nullable = false)
 |-- col6: long (nullable = false)


scala> val schema = df.schema
schema: org.apache.spark.sql.types.StructType = StructType(StructField(col1,StringType,true), StructField(col2,StringType,true), StructField(col3,StringType,true), StructField(eventtime,StringType,true), StructField(col4,IntegerType,false), StructField(col5,IntegerType,false), StructField(col6,LongType,false))

scala> val df2 = df.columns.foldLeft(df){ (acc,r) => acc.withColumn(r,col(r).cast("string")) }
df2: org.apache.spark.sql.DataFrame = [col1: string, col2: string ... 5 more fields]

scala> df2.printSchema
root
 |-- col1: string (nullable = true)
 |-- col2: string (nullable = true)
 |-- col3: string (nullable = true)
 |-- eventtime: string (nullable = true)
 |-- col4: string (nullable = false)
 |-- col5: string (nullable = false)
 |-- col6: string (nullable = false)


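As a side note (not part of the session above), the foldLeft over the columns can also be written as a single select; a minimal sketch, assuming the same df as above:

import org.apache.spark.sql.functions.col
val df2 = df.select(df.columns.map(c => col(c).cast("string")): _*)

Both versions produce the same all-string schema; the foldLeft form is just easier to extend if you ever want to cast only some columns.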
scala> val x = df2.flatMap(r => (0 until schema.length).map { idx => ((idx, r.getString(idx)), 1L) } )
x: org.apache.spark.sql.Dataset[((Int, String), Long)] = [_1: struct<_1: int, _2: string>, _2: bigint]

scala> x.show(5,false)
+---------------------+---+
|_1                   |_2 |
+---------------------+---+
|[0,servicecent4]     |1  |
|[1,ap-1-ioo-ppp]     |1  |
|[2,241.206.155.172]  |1  |
|[3,06-12-18:17:42:34]|1  |
|[4,162]              |1  |
+---------------------+---+
only showing top 5 rows


scala>
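If the point of pairing each (index, value) with 1L is to count how often each value occurs per column (my assumption about the intent), a possible follow-up is a reduceByKey on the underlying RDD; a sketch:

// RDD[((Int, String), Long)]: occurrence count of each value per column index
val counts = x.rdd.reduceByKey(_ + _)
counts.take(5).foreach(println)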
