Use of org.apache.spark.mllib.feature.HashingTF in the project spring-boot-quick by vector4wang.
From the class EmailFilter, method main:
/**
 * Trains a spam/ham email classifier with logistic regression (SGD) and
 * prints predictions for one spam-like and one ham-like example.
 *
 * <p>Pipeline: read raw email lines, hash words into 10,000 term-frequency
 * features with {@code HashingTF}, label spam=1 / ham=0, train, predict.
 */
public static void main(String[] args) {
    // Local-mode Spark context for this demo (app name is "spam classification" in Chinese).
    SparkConf conf = new SparkConf().setMaster("local").setAppName("垃圾邮件分类");
    JavaSparkContext sc = new JavaSparkContext(conf);
    try {
        // Each line of the input files is the full text of one email.
        JavaRDD<String> ham = sc.textFile("D:\\githubspace\\springbootquick\\src\\main\\resources\\ham.txt");
        JavaRDD<String> spam = sc.textFile("D:\\githubspace\\springbootquick\\src\\main\\resources\\spam.txt");
        // Map each whitespace-separated word into one of 10,000 hashed features.
        final HashingTF tf = new HashingTF(10000);
        // Label spam examples 1 and ham examples 0.
        JavaRDD<LabeledPoint> posExamples = spam.map(h -> new LabeledPoint(1, tf.transform(Arrays.asList(h.split(" ")))));
        JavaRDD<LabeledPoint> negExamples = ham.map(s -> new LabeledPoint(0, tf.transform(Arrays.asList(s.split(" ")))));
        JavaRDD<LabeledPoint> trainingData = posExamples.union(negExamples);
        // SGD logistic regression iterates over the data many times, so cache it.
        trainingData.cache();
        LogisticRegressionWithSGD lrLearner = new LogisticRegressionWithSGD();
        LogisticRegressionModel model = lrLearner.run(trainingData.rdd());
        // Unseen examples must be transformed with the SAME HashingTF used for training.
        Vector posTestExample = tf.transform(Arrays.asList("O M G GET cheap stuff by sending money to ...".split(" ")));
        System.out.println(posTestExample.toJson());
        Vector negTestExample = tf.transform(Arrays.asList("Hi Dad, I started studying Spark the other ...".split(" ")));
        System.out.println("Prediction for positive test example: " + model.predict(posTestExample));
        System.out.println("Prediction for negative test example: " + model.predict(negTestExample));
    } finally {
        // Fix: the original never stopped the context, leaking the local Spark runtime.
        sc.stop();
    }
}
Use of org.apache.spark.mllib.feature.HashingTF in the project cdap by caskdata.
From the class NaiveBayesTrainer, method run:
@Override
// Trains a Naive Bayes model on the input records, saves it into the configured
// FileSet, then classifies the texts from the TEXTS_TO_CLASSIFY stream and writes
// (text, prediction) pairs to the CLASSIFIED_TEXTS dataset.
public void run(SparkExecutionPluginContext sparkContext, JavaRDD<StructuredRecord> input) throws Exception {
Preconditions.checkArgument(input.count() != 0, "Input RDD is empty.");
// Hash each word of the text field into 100 term-frequency features.
// NOTE(review): the classifier side (NaiveBayesClassifier.initialize) must use
// the same feature count for predictions to be meaningful.
final HashingTF tf = new HashingTF(100);
JavaRDD<LabeledPoint> trainingData = input.map(new Function<StructuredRecord, LabeledPoint>() {
@Override
public LabeledPoint call(StructuredRecord record) throws Exception {
// should never happen, here to test app correctness in unit tests
if (inputSchema != null && !inputSchema.equals(record.getSchema())) {
throw new IllegalStateException("runtime schema does not match what was set at configure time.");
}
// Label comes from the configured prediction field; features are the
// hashed whitespace-separated words of the configured text field.
String text = record.get(config.fieldToClassify);
return new LabeledPoint((Double) record.get(config.predictionField), tf.transform(Lists.newArrayList(text.split(" "))));
}
});
// Naive Bayes training iterates over the data, so keep it in memory.
trainingData.cache();
// lambda = 1.0 is additive (Laplace) smoothing.
final NaiveBayesModel model = NaiveBayes.train(trainingData.rdd(), 1.0);
// save the model to a file in the output FileSet
JavaSparkContext javaSparkContext = sparkContext.getSparkContext();
FileSet outputFS = sparkContext.getDataset(config.fileSetName);
model.save(JavaSparkContext.toSparkContext(javaSparkContext), outputFS.getBaseLocation().append(config.path).toURI().getPath());
// Read the texts to classify as (offset, text) pairs from the stream.
JavaPairRDD<Long, String> textsToClassify = sparkContext.fromStream(TEXTS_TO_CLASSIFY, String.class);
JavaRDD<Vector> featuresToClassify = textsToClassify.map(new Function<Tuple2<Long, String>, Vector>() {
@Override
public Vector call(Tuple2<Long, String> longWritableTextTuple2) throws Exception {
// Apply the same HashingTF transformation that was used for training.
String text = longWritableTextTuple2._2();
return tf.transform(Lists.newArrayList(text.split(" ")));
}
});
JavaRDD<Double> predict = model.predict(featuresToClassify);
LOG.info("Predictions: {}", predict.collect());
// key the predictions with the message
// (zip pairs elements positionally, which is valid here because featuresToClassify
// was derived 1:1 from textsToClassify without re-partitioning)
JavaPairRDD<String, Double> keyedPredictions = textsToClassify.values().zip(predict);
// convert to byte[],byte[] to write to data
JavaPairRDD<byte[], byte[]> bytesRDD = keyedPredictions.mapToPair(new PairFunction<Tuple2<String, Double>, byte[], byte[]>() {
@Override
public Tuple2<byte[], byte[]> call(Tuple2<String, Double> tuple) throws Exception {
return new Tuple2<>(Bytes.toBytes(tuple._1()), Bytes.toBytes(tuple._2()));
}
});
sparkContext.saveAsDataset(bytesRDD, CLASSIFIED_TEXTS);
}
Use of org.apache.spark.mllib.feature.HashingTF in the project learning-spark by databricks.
From the class MLlib, method main:
/**
 * Book example: train a logistic-regression spam classifier with MLlib and
 * print predictions for one spam-like and one ham-like email.
 */
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("JavaBookExample");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);

    // Load the two corpora; every line holds the text of a single email.
    JavaRDD<String> spamLines = jsc.textFile("files/spam.txt");
    JavaRDD<String> hamLines = jsc.textFile("files/ham.txt");

    // Hash each word of an email into a 100-dimensional term-frequency vector.
    final HashingTF hashingTF = new HashingTF(100);

    // Build labeled training points: spam is labeled 1, ham is labeled 0.
    JavaRDD<LabeledPoint> spamPoints = spamLines.map(
        email -> new LabeledPoint(1, hashingTF.transform(Arrays.asList(email.split(" ")))));
    JavaRDD<LabeledPoint> hamPoints = hamLines.map(
        email -> new LabeledPoint(0, hashingTF.transform(Arrays.asList(email.split(" ")))));

    JavaRDD<LabeledPoint> training = spamPoints.union(hamPoints);
    // Logistic regression is iterative, so keep the training set in memory.
    training.cache();

    // Train a logistic-regression classifier using the SGD optimizer.
    LogisticRegressionWithSGD learner = new LogisticRegressionWithSGD();
    LogisticRegressionModel model = learner.run(training.rdd());

    // Transform two unseen emails with the SAME HashingTF used for training,
    // then ask the model for a spam/ham prediction on each.
    Vector spamExample = hashingTF.transform(
        Arrays.asList("O M G GET cheap stuff by sending money to ...".split(" ")));
    Vector hamExample = hashingTF.transform(
        Arrays.asList("Hi Dad, I started studying Spark the other ...".split(" ")));

    System.out.println("Prediction for positive test example: " + model.predict(spamExample));
    System.out.println("Prediction for negative test example: " + model.predict(hamExample));

    jsc.stop();
}
Use of org.apache.spark.mllib.feature.HashingTF in the project cdap by caskdata.
From the class NaiveBayesClassifier, method initialize:
@Override
// Loads the previously trained Naive Bayes model from the configured FileSet
// and prepares the HashingTF used to featurize incoming texts.
public void initialize(SparkExecutionPluginContext context) throws Exception {
    // Resolve the location inside the FileSet where training saved the model.
    FileSet modelFileSet = context.getDataset(config.fileSetName);
    Location savedModel = modelFileSet.getBaseLocation().append(config.path);
    if (!savedModel.exists()) {
        throw new IllegalArgumentException(String.format("Failed to find model to use for classification. Location does not exist: %s.", savedModel));
    }

    // load the model from a file in the model fileset
    // (NaiveBayesModel.load takes the Scala SparkContext, not the Java wrapper)
    SparkContext scalaSparkContext = JavaSparkContext.toSparkContext(context.getSparkContext());
    loadedModel = NaiveBayesModel.load(scalaSparkContext, savedModel.toURI().getPath());

    // Must hash with the same feature count (100) that the trainer used.
    tf = new HashingTF(100);
}
Aggregations