Use of org.apache.spark.ml.feature.Word2Vec in project mmtf-spark by sbl-sdsc.
Class ProteinSequenceEncoder, method shifted3GramWord2VecEncode.
/**
* Encodes a protein sequence as three sets of non-overlapping 3-grams
* (frame-shifted by 0, 1, and 2 residues), trains a single Word2Vec model
* on all 3-grams, and then averages the three resulting feature vectors.
*
* <P> Asgari E, Mofrad MRK (2015) Continuous Distributed Representation
* of Biological Sequences for Deep Proteomics and Genomics.
* PLOS ONE 10(11): e0141287. doi:
* <a href="https://doi.org/10.1371/journal.pone.0141287">10.1371/journal.pone.0141287</a>
*
* @param windowSize
* width of the window used to slide across the sequence; context
* words are taken from [-window, window]
* @param vectorSize
* dimension of the feature vector
*
* @return dataset with a feature vector column added to the original dataset
*/
public Dataset<Row> shifted3GramWord2VecEncode(int windowSize, int vectorSize) {
    // create three frame-shifted, non-overlapping 3-gram columns out of the sequence
    // e.g., IDCGHVDSL => [IDC, GHV, DSL] (shift 0), [DCG, HVD] (shift 1), [CGH, VDS] (shift 2)
    // TODO set input column
    data = SequenceNgrammer.shiftedNgram(data, 3, 0, "ngram0");
    data = SequenceNgrammer.shiftedNgram(data, 3, 1, "ngram1");
    data = SequenceNgrammer.shiftedNgram(data, 3, 2, "ngram2");

    Dataset<Row> ngram0 = data.select("ngram0").withColumnRenamed("ngram0", "ngram");
    Dataset<Row> ngram1 = data.select("ngram1").withColumnRenamed("ngram1", "ngram");
    Dataset<Row> ngram2 = data.select("ngram2").withColumnRenamed("ngram2", "ngram");
    Dataset<Row> ngrams = ngram0.union(ngram1).union(ngram2);

    // train a single Word2Vec model on the union of all three 3-gram columns
    // [IDC, GHV, DSL, ... => [0.1234, 0.2394, ...]
    Word2Vec word2Vec = new Word2Vec()
            .setInputCol("ngram")
            .setMinCount(10)
            .setNumPartitions(8)
            .setWindowSize(windowSize)
            .setVectorSize(vectorSize);
    model = word2Vec.fit(ngrams);

    // apply the trained model to each shifted 3-gram column
    model.setInputCol("ngram0");
    model.setOutputCol("features0");
    data = model.transform(data);

    model.setInputCol("ngram1");
    model.setOutputCol("features1");
    data = model.transform(data);

    model.setInputCol("ngram2");
    model.setOutputCol("features2");
    data = model.transform(data);

    // average the three feature vectors into the output column
    data = averageFeatureVectors(data, outputCol);
    return data;
}
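The averaging step calls averageFeatureVectors, which belongs to the enclosing class and is not shown in this excerpt. The sketch below shows one way an element-wise average of the three Word2Vec columns could be written with a Spark SQL UDF; the class name FeatureVectorAverager is hypothetical, the column names features0/features1/features2 simply mirror the method above, and this is not the project's actual implementation.

import org.apache.spark.ml.linalg.SQLDataTypes;
import org.apache.spark.ml.linalg.Vector;
import org.apache.spark.ml.linalg.Vectors;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.api.java.UDF3;

import static org.apache.spark.sql.functions.callUDF;
import static org.apache.spark.sql.functions.col;

public class FeatureVectorAverager {

    /** Element-wise average of the three shifted Word2Vec columns into outputCol. */
    public static Dataset<Row> averageFeatureVectors(Dataset<Row> data, String outputCol) {
        data.sparkSession().udf().register("averageVectors",
                (UDF3<Vector, Vector, Vector, Vector>) (v0, v1, v2) -> {
                    // null handling (e.g., sequences too short to form 3-grams) omitted for brevity
                    double[] f0 = v0.toArray();
                    double[] f1 = v1.toArray();
                    double[] f2 = v2.toArray();
                    double[] avg = new double[f0.length];
                    for (int i = 0; i < avg.length; i++) {
                        avg[i] = (f0[i] + f1[i] + f2[i]) / 3.0;
                    }
                    return Vectors.dense(avg);
                }, SQLDataTypes.VectorType());

        return data.withColumn(outputCol,
                callUDF("averageVectors", col("features0"), col("features1"), col("features2")));
    }
}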
Use of org.apache.spark.ml.feature.Word2Vec in project mmtf-spark by sbl-sdsc.
Class ProteinSequenceEncoder, method overlappingNgramWord2VecEncode.
/**
* Encodes a protein sequence by converting it into n-grams and
* then transforming it into a Word2Vec feature vector.
*
* @param n
* the number of words in an n-gram
* @param windowSize
* width of the window used to slide across the sequence; context
* words are taken from [-window, window]
* @param vectorSize
* dimension of the feature vector
*
* @return dataset with a feature vector column added to the original dataset
*/
public Dataset<Row> overlappingNgramWord2VecEncode(int n, int windowSize, int vectorSize) {
    // create overlapping n-grams out of the sequence
    // e.g., 2-grams: IDCGH... => [ID, DC, CG, GH, ...
    data = SequenceNgrammer.ngram(data, n, "ngram");

    // convert n-grams to a Word2Vec feature vector
    // [ID, DC, CG, GH, ... => [0.1234, 0.2394, ...]
    Word2Vec word2Vec = new Word2Vec()
            .setInputCol("ngram")
            .setOutputCol(outputCol)
            .setNumPartitions(8)
            .setWindowSize(windowSize)
            .setVectorSize(vectorSize);

    model = word2Vec.fit(data);
    data = model.transform(data);
    return data;
}
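A minimal usage sketch for this method follows. The ProteinSequenceEncoder(Dataset<Row>) constructor, its package path, and the assumption that the encoder reads a column named "sequence" are assumptions made for illustration and should be verified against the project; the toy sequences deliberately repeat 2-grams so that Word2Vec's default minimum word count does not leave the vocabulary empty.

import java.util.Arrays;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

// assumed package path; verify against the mmtf-spark version in use
import edu.sdsc.mmtf.spark.ml.ProteinSequenceEncoder;

public class OverlappingNgramExample {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName("OverlappingNgramExample")
                .getOrCreate();

        // toy sequences; real runs use full PDB-derived sequence datasets
        StructType schema = new StructType().add("sequence", DataTypes.StringType);
        Dataset<Row> data = spark.createDataFrame(Arrays.asList(
                RowFactory.create("AGAGAGAGAGAGAGAG"),
                RowFactory.create("GAGAGAGAGAGAGAGA")), schema);

        // assumed constructor taking the dataset to encode
        ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);

        // overlapping 2-grams, window of 5, 10-dimensional embedding
        Dataset<Row> encoded = encoder.overlappingNgramWord2VecEncode(2, 5, 10);
        encoded.show(false);

        spark.stop();
    }
}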
Use of org.apache.spark.ml.feature.Word2Vec in project mm-dev by sbl-sdsc.
Class CathClassificationDataset, method sequenceToFeatureVector.
private static Dataset<Row> sequenceToFeatureVector(Dataset<Row> data, int n, int windowSize, int vectorSize) {
    // split the sequence into an array of one-letter codes (1-grams)
    // e.g., IDCGHVDSL => [i, d, c, g, h, v, ...
    RegexTokenizer tokenizer = new RegexTokenizer()
            .setInputCol("sequence").setOutputCol("1gram").setPattern("(?!^)");

    // create n-grams out of the 1-grams
    // e.g., 2-grams: [i, d, c, g, h, v, ... => [i d, d c, c g, g ...
    NGram ngrammer = new NGram().setN(n).setInputCol("1gram").setOutputCol("ngram");

    // convert n-grams to a Word2Vec feature vector
    // [i d, d c, c g, g ... => [0.1234, 0.23948, ...]
    Word2Vec word2Vec = new Word2Vec()
            .setInputCol("ngram").setOutputCol("features")
            .setWindowSize(windowSize).setVectorSize(vectorSize).setMinCount(0);

    Pipeline pipeline = new Pipeline()
            .setStages(new PipelineStage[] { tokenizer, ngrammer, word2Vec });
    // .setStages(new PipelineStage[] { tokenizer, word2Vec });

    PipelineModel model = pipeline.fit(data);
    data = model.transform(data);
    return data;
}
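For reference, here is a self-contained sketch that runs the same tokenizer -> n-gram -> Word2Vec pipeline on a small in-memory dataset using only standard Spark ML classes; the class name SequencePipelineDemo, the toy sequences, and the parameter values (n = 2, window 5, vector size 10) are illustrative choices, not values from the project.

import java.util.Arrays;

import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.PipelineModel;
import org.apache.spark.ml.PipelineStage;
import org.apache.spark.ml.feature.NGram;
import org.apache.spark.ml.feature.RegexTokenizer;
import org.apache.spark.ml.feature.Word2Vec;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class SequencePipelineDemo {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName("SequencePipelineDemo")
                .getOrCreate();

        // toy dataset with one protein sequence per row
        StructType schema = new StructType().add("sequence", DataTypes.StringType);
        Dataset<Row> data = spark.createDataFrame(Arrays.asList(
                RowFactory.create("IDCGHVDSL"),
                RowFactory.create("MSTNPKPQR")), schema);

        // same three stages as sequenceToFeatureVector above:
        // split into one-letter tokens, build overlapping 2-grams, embed with Word2Vec
        RegexTokenizer tokenizer = new RegexTokenizer()
                .setInputCol("sequence").setOutputCol("1gram").setPattern("(?!^)");
        NGram ngrammer = new NGram().setN(2).setInputCol("1gram").setOutputCol("ngram");
        Word2Vec word2Vec = new Word2Vec()
                .setInputCol("ngram").setOutputCol("features")
                .setWindowSize(5).setVectorSize(10).setMinCount(0);

        Pipeline pipeline = new Pipeline()
                .setStages(new PipelineStage[] { tokenizer, ngrammer, word2Vec });

        PipelineModel model = pipeline.fit(data);
        model.transform(data).select("sequence", "ngram", "features").show(false);

        spark.stop();
    }
}

The pattern "(?!^)" makes RegexTokenizer split between every pair of characters (and lowercase them by default), so each residue becomes its own token before the NGram stage recombines them into overlapping n-grams.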