score:-1

Accepted answer

I found the solution; I was able to do it as follows.

 // Train a regression model on (userid, date) -> label from a CSV file.
 // Assumes a live Spark shell: `spark` (SparkSession) and `sc` (SparkContext) in scope.
 import scala.beans.BeanInfo
 import org.apache.spark.{SparkConf, SparkContext}
 import org.apache.spark.ml.Pipeline
 import org.apache.spark.ml.classification.LogisticRegression
 import org.apache.spark.ml.feature.{HashingTF, Tokenizer, StringIndexer, VectorAssembler}
 import org.apache.spark.ml.regression.LinearRegression
 import org.apache.spark.ml.attribute.NominalAttribute
 import org.apache.spark.mllib.linalg.{Vector, Vectors}
 import org.apache.spark.sql.{Row, SQLContext}
 import org.apache.spark.sql.types.{StructType, StructField, StringType}

 // Field names must stay lowercase to match the toDF column names below,
 // otherwise .as[LabeledDocument] fails to resolve the encoder.
 case class LabeledDocument(userid: Double, date: String, label: Double)

 // Load the CSV with schema inference and expose it as a typed Dataset.
 val trainingData = spark.read.option("inferSchema", true)
   .csv("/root/predictiondata10.csv")
   .toDF("userid", "date", "label")
   .as[LabeledDocument]

 // Encode the string `date` column as a numeric category index.
 val dateIndexer = new StringIndexer().setInputCol("date").setOutputCol("datecat")
 val indexed = dateIndexer.fit(trainingData).transform(trainingData)

 // Combine the numeric columns into a single feature vector column.
 val assembler = new VectorAssembler()
   .setInputCols(Array("datecat", "userid"))
   .setOutputCol("rawfeatures")
 val output = assembler.transform(indexed)

 // Collect and rebuild as a DataFrame of plain tuples, stringifying the
 // vector column so the Tokenizer below can consume it as text.
 // NOTE(review): collect() pulls all rows to the driver — fine for small data only.
 val rows = output.select("userid", "date", "label", "datecat", "rawfeatures").collect()
 val asTuple = rows.map(a => (a.getInt(0), a.getString(1), a.getDouble(2), a.getDouble(3), a(4).toString))
 val r2 = sc.parallelize(asTuple).toDF("userid", "date", "label", "datecat", "rawfeatures")

 // 70/30 train/test split.
 val Array(training, testData) = r2.randomSplit(Array(0.7, 0.3))

 // Text pipeline: tokenize the stringified features, hash them, then fit
 // an elastic-net linear regression.
 val tokenizer = new Tokenizer().setInputCol("rawfeatures").setOutputCol("words")
 val hashingTF = new HashingTF()
   .setNumFeatures(1000)
   .setInputCol(tokenizer.getOutputCol)
   .setOutputCol("features")
 val lr = new LinearRegression().setMaxIter(100).setRegParam(0.001).setElasticNetParam(0.0001)
 val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, lr))

 val model = pipeline.fit(training.toDF())
 model.transform(testData.toDF()).show()

Related Query

More queries from the same tag