Use of edu.sdsc.mmtf.spark.ml.ProteinSequenceEncoder in project mmtf-spark by sbl-sdsc:
the class ProteinFoldDatasetCreator, method main.
/**
 * @param args args[0] path to the dataset output file
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 1) {
        System.err.println("Usage: " + ProteinFoldDatasetCreator.class.getSimpleName() + " <dataset output file>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(ProteinFoldDatasetCreator.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant Pisces
    // subset (<= 20% sequence identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
            .readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));

    // get secondary structure content
    Dataset<Row> data = SecondaryStructureExtractor.getDataset(pdb);

    // classify chains by secondary structure type
    double minThreshold = 0.05;
    double maxThreshold = 0.15;
    data = addProteinFoldType(data, minThreshold, maxThreshold);

    // create a binary classification dataset (alpha vs. beta folds)
    data = data.filter("foldType = 'alpha' OR foldType = 'beta'").cache();

    // alternatively, create a three-state classification dataset (alpha, beta, alpha+beta)
    // data = data.filter("foldType != 'other'").cache();

    // add a Word2Vec-encoded feature vector
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    int n = 2;           // n-gram size
    int windowSize = 11; // Word2Vec context window size
    int vectorSize = 50; // dimension of the feature vector
    data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize);

    data.printSchema();
    data.show(25);

    // keep only a subset of relevant fields for further processing
    data = data.select("structureChainId", "alpha", "beta", "coil", "foldType", "features");

    data.write().mode("overwrite").format("parquet").save(args[0]);

    long end = System.nanoTime();
    System.out.println((end - start) / 1E9 + " sec");
}
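The Parquet file written above can later be loaded back for classifier training. A minimal sketch, not part of mmtf-spark, assuming a local SparkSession and a hypothetical output path (imports from org.apache.spark.sql omitted, as elsewhere on this page):

    SparkSession spark = SparkSession.builder().master("local[*]").appName("ReadFoldDataset").getOrCreate();
    Dataset<Row> dataset = spark.read().parquet("/path/to/foldDataset.parquet"); // hypothetical path (args[0] above)
    // split into training and test sets with a fixed seed for reproducibility
    Dataset<Row>[] splits = dataset.randomSplit(new double[]{0.7, 0.3}, 42);
    Dataset<Row> training = splits[0];
    Dataset<Row> test = splits[1];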
Use of edu.sdsc.mmtf.spark.ml.ProteinSequenceEncoder in project mmtf-spark by sbl-sdsc:
the class SecondaryStructureElementsWord2VecEncoder, method main.
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 0 && args.length != 2) {
        System.err.println("Usage: " + SecondaryStructureElementsWord2VecEncoder.class.getSimpleName() + " [<outputFilePath> <fileFormat>]");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(SecondaryStructureElementsWord2VecEncoder.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant Pisces
    // subset (<= 20% sequence identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
            .readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));

    int segmentLength = 11;

    // extract helical sequence segments (DSSP code "H")
    Dataset<Row> data = SecondaryStructureElementExtractor.getDataset(pdb, "H", segmentLength);
    System.out.println(data.count());
    data.show(10, false);

    // add a Word2Vec-encoded feature vector
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    int n = 2;                                // n-gram size
    int windowSize = (segmentLength - 1) / 2; // context window centered on the segment
    int vectorSize = 50;                      // dimension of the feature vector
    data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize);
    data.show(50, false);

    // optionally, save results
    if (args.length > 0) {
        if (args[1].equals("json")) {
            // coalesce data into a single file
            data = data.coalesce(1);
        }
        data.write().mode("overwrite").format(args[1]).save(args[0]);
    }

    long end = System.nanoTime();
    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}
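The same extractor can be pointed at other secondary structure types. A hedged variation, assuming SecondaryStructureElementExtractor accepts other DSSP Q3 codes such as "E" for beta strands:

    // extract beta-strand segments instead of helices (assumption: "E" is accepted)
    Dataset<Row> strands = SecondaryStructureElementExtractor.getDataset(pdb, "E", segmentLength);
    strands.show(10, false);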
Use of edu.sdsc.mmtf.spark.ml.ProteinSequenceEncoder in project mmtf-spark by sbl-sdsc:
the class SecondaryStructureShiftedWord2VecEncoder, method main.
/**
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet)
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 2) {
        System.err.println("Usage: " + SecondaryStructureShiftedWord2VecEncoder.class.getSimpleName() + " <outputFilePath> <fileFormat>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(SecondaryStructureShiftedWord2VecEncoder.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant set
    // (<= 20% sequence identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
            .readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));

    // extract sequence segments with secondary structure labels
    int segmentLength = 11;
    Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength);

    // create a Word2Vec representation of the protein sequences
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    int windowSize = (segmentLength - 1) / 2; // context window centered on the segment
    int vectorSize = 50;                      // dimension of the feature vector
    data = encoder.shifted3GramWord2VecEncode(windowSize, vectorSize).cache();

    data.printSchema();
    data.show(25, false);

    if (args[1].equals("json")) {
        // coalesce data into a single file
        data = data.coalesce(1);
    }
    data.write().mode("overwrite").format(args[1]).save(args[0]);

    long end = System.nanoTime();
    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}
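shifted3GramWord2VecEncode follows the ProtVec idea of shifted 3-grams: a sequence is split into three lists of non-overlapping 3-grams, each offset by one residue. The sketch below is an illustration only, not the library's implementation, and the exact splitting is an assumption (java.util imports omitted):

    String seq = "SRMPSPPMPVPPAALFNR"; // hypothetical sequence
    for (int shift = 0; shift < 3; shift++) {
        List<String> ngrams = new ArrayList<>();
        // collect non-overlapping 3-grams starting at the given offset
        for (int i = shift; i + 3 <= seq.length(); i += 3) {
            ngrams.add(seq.substring(i, i + 3));
        }
        System.out.println("shift " + shift + ": " + ngrams);
    }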
Use of edu.sdsc.mmtf.spark.ml.ProteinSequenceEncoder in project mmtf-spark by sbl-sdsc:
the class SecondaryStructureWord2VecModelEncoder, method main.
/**
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet), args[2] word2VecModelFile
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfFullPath();

    if (args.length != 3) {
        System.err.println("Usage: " + SecondaryStructureWord2VecModelEncoder.class.getSimpleName() + " <outputFilePath> <fileFormat> <word2VecModelFile>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(SecondaryStructureWord2VecModelEncoder.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant Pisces
    // subset (<= 20% sequence identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
            .readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));

    // extract sequence segments with secondary structure labels
    int segmentLength = 11;
    Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength);

    // add a Word2Vec-encoded feature vector using
    // a pre-trained Word2Vec model read from file
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    int n = 2; // n-gram size
    String modelFileName = args[2];
    data = encoder.overlappingNgramWord2VecEncode(modelFileName, n).cache();

    data.printSchema();
    data.show(25, false);

    if (args[1].equals("json")) {
        // coalesce data into a single file
        data = data.coalesce(1);
    }
    data.write().mode("overwrite").format(args[1]).save(args[0]);

    long end = System.nanoTime();
    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}
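A model file for args[2] can be produced by the PdbSequenceToWord2Vec example below. If the file is a standard Spark ML Word2VecModel (an assumption here), it can also be inspected directly (import org.apache.spark.ml.feature.Word2VecModel):

    // load and inspect a saved Word2Vec model (path is hypothetical)
    Word2VecModel model = Word2VecModel.load("/path/to/word2VecModel");
    model.getVectors().show(5, false); // word -> vector mapping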
Use of edu.sdsc.mmtf.spark.ml.ProteinSequenceEncoder in project mmtf-spark by sbl-sdsc:
the class PdbSequenceToWord2Vec, method main.
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 1) {
        System.err.println("Usage: " + PdbSequenceToWord2Vec.class.getSimpleName() + " <outputFileName>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(PdbSequenceToWord2Vec.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant Pisces
    // subset (<= 40% sequence identity) of L-protein chains
    int sequenceIdentity = 40;
    double resolution = 3.0;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
            .readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));

    Dataset<Row> data = PolymerSequenceExtractor.getDataset(pdb);
    data.show(10, false);

    // length of polymer sequence segment (number of residues)
    int segmentLength = 11;

    // train a Word2Vec model on n-grams of the polymer sequences
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    int n = 2;                                // n-gram size
    int windowSize = (segmentLength - 1) / 2; // context window size
    int vectorSize = 50;                      // dimension of the word vectors
    data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize);

    // save the trained Word2Vec model for later reuse
    encoder.getWord2VecModel().save(args[0]);

    long end = System.nanoTime();
    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}
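The model saved here is the kind of pre-trained model consumed by SecondaryStructureWord2VecModelEncoder above. A minimal sketch of the reuse, assuming the n-gram size matches the one used for training and reusing the pdb RDD from this example:

    // re-encode labeled segments with the saved model (file name from args[0] above)
    Dataset<Row> segments = SecondaryStructureSegmentExtractor.getDataset(pdb, 11);
    ProteinSequenceEncoder segmentEncoder = new ProteinSequenceEncoder(segments);
    Dataset<Row> encoded = segmentEncoder.overlappingNgramWord2VecEncode(args[0], 2);
    encoded.show(5, false);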