Use of org.apache.spark.ml.feature.RegexTokenizer in the project jpmml-sparkml by jpmml.
From the class RegexTokenizerConverter, the method encodeFeatures.
/**
 * Encodes the tokenizer's single input column as one {@link DocumentFeature}.
 *
 * <p>Only tokenizers configured in splitter ("gaps") mode with a minimum token
 * length of 1 are accepted. When lowercasing is enabled, the input is first
 * wrapped in a derived string field that applies the "lowercase" function, and
 * the document feature is built on that derived field instead.
 *
 * @param encoder the encoder used to look up the input feature/field and to
 *                register the derived field
 * @return a single-element list holding the document feature, carrying the
 *         tokenizer's pattern
 * @throws IllegalArgumentException if the tokenizer is in token matching mode
 *                                  or its minimum token length is not 1
 */
@Override
public List<Feature> encodeFeatures(SparkMLEncoder encoder) {
    RegexTokenizer tokenizer = getTransformer();

    // The pattern must describe the separators between tokens, not the tokens themselves.
    boolean splitterMode = tokenizer.getGaps();
    if (!splitterMode) {
        throw new IllegalArgumentException("Expected splitter mode, got token matching mode");
    }

    int minTokenLength = tokenizer.getMinTokenLength();
    if (minTokenLength != 1) {
        throw new IllegalArgumentException("Expected 1 as minimum token length, got " + minTokenLength + " as minimum token length");
    }

    Feature inputFeature = encoder.getOnlyFeature(tokenizer.getInputCol());
    Field<?> documentField = encoder.getField(inputFeature.getName());

    if (tokenizer.getToLowercase()) {
        // Replace the raw field with a derived field holding the lowercased text.
        Apply lowercaseCall = PMMLUtil.createApply("lowercase", inputFeature.ref());
        documentField = encoder.createDerivedField(
                FeatureUtil.createName("lowercase", inputFeature),
                OpType.CATEGORICAL,
                DataType.STRING,
                lowercaseCall);
    }

    Feature documentFeature = new DocumentFeature(encoder, documentField, tokenizer.getPattern());
    return Collections.singletonList(documentFeature);
}
Use of org.apache.spark.ml.feature.RegexTokenizer in the project mm-dev by sbl-sdsc.
From the class CathClassificationDataset, the method sequenceToFeatureVector.
/**
 * Runs a tokenizer -> n-gram -> Word2Vec pipeline over the "sequence" column
 * and returns the dataset produced by the fitted pipeline.
 *
 * <p>The pipeline writes its intermediate and final results to the columns
 * "1gram", "ngram" and "features".
 *
 * @param data       input dataset; must provide the "sequence" column read by the tokenizer
 * @param n          n-gram length passed to {@code NGram.setN}
 * @param windowSize window size passed to {@code Word2Vec.setWindowSize}
 * @param vectorSize vector size passed to {@code Word2Vec.setVectorSize}
 * @return the result of transforming {@code data} with the pipeline fitted on {@code data}
 */
private static Dataset<Row> sequenceToFeatureVector(Dataset<Row> data, int n, int windowSize, int vectorSize) {
    // Stage 1: break the sequence into an array of one-letter codes (1-grams),
    // e.g. IDCGHVDSL => [i, d, c, g, h, v...
    RegexTokenizer oneGramSplitter = new RegexTokenizer()
            .setInputCol("sequence")
            .setOutputCol("1gram")
            .setPattern("(?!^)");

    // Stage 2: combine consecutive 1-grams into n-grams,
    // e.g., 2-gram [i, d, c, g, h, v... => [i d, d c, c g, g...
    NGram nGramBuilder = new NGram()
            .setN(n)
            .setInputCol("1gram")
            .setOutputCol("ngram");

    // Stage 3: map the n-grams to a W2V feature vector,
    // [i d, d c, c g, g... => [0.1234, 0.23948, ...]
    Word2Vec embedder = new Word2Vec()
            .setInputCol("ngram")
            .setOutputCol("features")
            .setWindowSize(windowSize)
            .setVectorSize(vectorSize)
            .setMinCount(0);

    PipelineStage[] stages = new PipelineStage[] { oneGramSplitter, nGramBuilder, embedder };
    Pipeline featurePipeline = new Pipeline().setStages(stages);

    // Fit on the same dataset that is subsequently transformed.
    PipelineModel fittedPipeline = featurePipeline.fit(data);
    return fittedPipeline.transform(data);
}
Aggregations