Example use of org.apache.spark.ml.feature.NGram in the project mm-dev by sbl-sdsc.
From the class CathClassificationDataset, the method sequenceToFeatureVector:
/**
 * Turns the "sequence" column of a dataset into Word2Vec feature vectors.
 *
 * <p>A three-stage pipeline is assembled (tokenizer, n-gram builder, Word2Vec),
 * fitted on {@code data}, and then applied to that same {@code data}. The
 * intermediate columns "1gram" and "ngram" and the final "features" column
 * are the output columns configured on the respective stages.
 *
 * @param data       input dataset; must provide the "sequence" column read by the tokenizer
 * @param n          n-gram length passed to {@code NGram.setN}
 * @param windowSize window size passed to {@code Word2Vec.setWindowSize}
 * @param vectorSize vector size passed to {@code Word2Vec.setVectorSize}
 * @return the result of applying the fitted pipeline model to {@code data}
 */
private static Dataset<Row> sequenceToFeatureVector(Dataset<Row> data, int n, int windowSize, int vectorSize) {
    // Stage 1: break each sequence into one-letter tokens (1-grams),
    // e.g. IDCGHVDSL => [i, d, c, g, h, v, ...
    RegexTokenizer letterSplitter = new RegexTokenizer()
            .setInputCol("sequence")
            .setOutputCol("1gram")
            .setPattern("(?!^)");

    // Stage 2: combine consecutive tokens into n-grams,
    // e.g. for n = 2: [i, d, c, g, h, v, ... => [i d, d c, c g, g ...
    NGram ngramBuilder = new NGram()
            .setN(n)
            .setInputCol("1gram")
            .setOutputCol("ngram");

    // Stage 3: embed the n-grams as a Word2Vec feature vector,
    // e.g. [i d, d c, c g, g ... => [0.1234, 0.23948, ...]
    Word2Vec embedder = new Word2Vec()
            .setInputCol("ngram")
            .setOutputCol("features")
            .setWindowSize(windowSize)
            .setVectorSize(vectorSize)
            .setMinCount(0);

    PipelineStage[] stages = { letterSplitter, ngramBuilder, embedder };
    PipelineModel fitted = new Pipeline().setStages(stages).fit(data);

    // The model is applied to the very dataset it was fitted on.
    return fitted.transform(data);
}
Example use of org.apache.spark.ml.feature.NGram in the project jpmml-sparkml by jpmml.
From the class NGramConverter, the method encodeFeatures:
/**
 * Looks up the single feature registered for the transformer's input column
 * and returns it unchanged as the result for this transformer.
 *
 * @param encoder the encoder that is queried for the input column's feature
 * @return a one-element list holding that feature
 * @throws ClassCastException if the looked-up feature is not a {@code DocumentFeature}
 */
@Override
public List<Feature> encodeFeatures(SparkMLEncoder encoder) {
    NGram ngram = getTransformer();
    String inputColumn = ngram.getInputCol();

    // The cast requires the input column's feature to be a DocumentFeature.
    DocumentFeature inputDocument = (DocumentFeature) encoder.getOnlyFeature(inputColumn);

    List<Feature> encoded = Collections.<Feature>singletonList(inputDocument);
    return encoded;
}
Aggregations