Use of org.apache.spark.ml.feature.StopWordsRemover in project jpmml-sparkml by jpmml.
The method encodeFeatures of the class StopWordsRemoverConverter.
/**
 * Registers the stop words of the {@link StopWordsRemover} transformer on the
 * document feature of the transformer's input column.
 *
 * <p>Each stop word is split with the document feature's word separator
 * pattern. Stop words that split into more than one token are left out; see
 * https://issues.apache.org/jira/browse/SPARK-18374</p>
 *
 * @param encoder the encoder that supplies the feature of the input column
 * @return a single-element list holding the document feature
 * @throws IllegalArgumentException if a stop word that was not skipped contains
 *         punctuation according to {@code TermUtil.hasPunctuation}
 */
@Override
public List<Feature> encodeFeatures(SparkMLEncoder encoder) {
    StopWordsRemover remover = getTransformer();

    String inputCol = remover.getInputCol();
    DocumentFeature docFeature = (DocumentFeature) encoder.getOnlyFeature(inputCol);

    Pattern separator = Pattern.compile(docFeature.getWordSeparatorRE());
    DocumentFeature.StopWordSet filter = new DocumentFeature.StopWordSet(remover.getCaseSensitive());

    for (String candidate : remover.getStopWords()) {
        // Only stop words that the separator leaves as a single token are kept.
        boolean singleToken = separator.split(candidate).length <= 1;

        if (singleToken) {
            if (TermUtil.hasPunctuation(candidate)) {
                throw new IllegalArgumentException(candidate);
            }

            filter.add(candidate);
        }
    }

    docFeature.addStopWordSet(filter);

    return Collections.<Feature>singletonList(docFeature);
}
Aggregations