Use of edu.sdsc.mmtf.spark.ml.ProteinSequenceEncoder in project mmtf-spark by sbl-sdsc:
the class ProteinFoldDatasetCreator, method main.
/**
 * @param args args[0] path to the dataset output file
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 1) {
        System.err.println("Usage: " + ProteinFoldDatasetCreator.class.getSimpleName() + " <dataset output file>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(ProteinFoldDatasetCreator.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant Pisces
    // subset (<= 20% sequence identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
            .readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));

    // get secondary structure content
    Dataset<Row> data = SecondaryStructureExtractor.getDataset(pdb);

    // classify chains by secondary structure type
    double minThreshold = 0.05;
    double maxThreshold = 0.15;
    data = addProteinFoldType(data, minThreshold, maxThreshold);

    // create a binary classification dataset (alpha vs. beta folds)
    data = data.filter("foldType = 'alpha' OR foldType = 'beta'").cache();

    // alternatively, create a three-state classification dataset (alpha, beta, alpha+beta)
    // data = data.filter("foldType != 'other'").cache();

    // add a Word2Vec-encoded feature vector
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    int n = 2;           // n-gram size
    int windowSize = 11; // Word2Vec context window size
    int vectorSize = 50; // dimension of the feature vector
    data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize);

    data.printSchema();
    data.show(25);

    // keep only a subset of relevant fields for further processing
    data = data.select("structureChainId", "alpha", "beta", "coil", "foldType", "features");

    data.write().mode("overwrite").format("parquet").save(args[0]);

    long end = System.nanoTime();
    System.out.println((end - start) / 1E9 + " sec");
}
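The Parquet file written above can later be loaded back for classifier training. A minimal sketch, not part of mmtf-spark, assuming a local SparkSession and a hypothetical output path (imports from org.apache.spark.sql omitted, as elsewhere on this page):

    SparkSession spark = SparkSession.builder().master("local[*]").appName("ReadFoldDataset").getOrCreate();
    Dataset<Row> dataset = spark.read().parquet("/path/to/foldDataset.parquet"); // hypothetical path (args[0] above)
    // split into training and test sets with a fixed seed for reproducibility
    Dataset<Row>[] splits = dataset.randomSplit(new double[]{0.7, 0.3}, 42);
    Dataset<Row> training = splits[0];
    Dataset<Row> test = splits[1];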
Use of edu.sdsc.mmtf.spark.ml.ProteinSequenceEncoder in project mmtf-spark by sbl-sdsc:
the class SecondaryStructureElementsWord2VecEncoder, method main.
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 0 && args.length != 2) {
        System.err.println("Usage: " + SecondaryStructureElementsWord2VecEncoder.class.getSimpleName() + " [<outputFilePath> <fileFormat>]");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(SecondaryStructureElementsWord2VecEncoder.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant Pisces
    // subset (<= 20% sequence identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
            .readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));

    int segmentLength = 11;

    // extract helical sequence segments (DSSP code "H")
    Dataset<Row> data = SecondaryStructureElementExtractor.getDataset(pdb, "H", segmentLength);
    System.out.println(data.count());
    data.show(10, false);

    // add a Word2Vec-encoded feature vector
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    int n = 2;                                // n-gram size
    int windowSize = (segmentLength - 1) / 2; // context window centered on the segment
    int vectorSize = 50;                      // dimension of the feature vector
    data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize);
    data.show(50, false);

    // optionally, save results
    if (args.length > 0) {
        if (args[1].equals("json")) {
            // coalesce data into a single file
            data = data.coalesce(1);
        }
        data.write().mode("overwrite").format(args[1]).save(args[0]);
    }

    long end = System.nanoTime();
    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}
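The same extractor can be pointed at other secondary structure types. A hedged variation, assuming SecondaryStructureElementExtractor accepts other DSSP Q3 codes such as "E" for beta strands:

    // extract beta-strand segments instead of helices (assumption: "E" is accepted)
    Dataset<Row> strands = SecondaryStructureElementExtractor.getDataset(pdb, "E", segmentLength);
    strands.show(10, false);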
Use of edu.sdsc.mmtf.spark.ml.ProteinSequenceEncoder in project mmtf-spark by sbl-sdsc:
the class SecondaryStructureShiftedWord2VecEncoder, method main.
/**
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet)
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 2) {
        System.err.println("Usage: " + SecondaryStructureShiftedWord2VecEncoder.class.getSimpleName() + " <outputFilePath> <fileFormat>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(SecondaryStructureShiftedWord2VecEncoder.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant set
    // (<= 20% sequence identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
            .readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));

    // extract sequence segments with secondary structure labels
    int segmentLength = 11;
    Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength);

    // create a Word2Vec representation of the protein sequences
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    int windowSize = (segmentLength - 1) / 2; // context window centered on the segment
    int vectorSize = 50;                      // dimension of the feature vector
    data = encoder.shifted3GramWord2VecEncode(windowSize, vectorSize).cache();

    data.printSchema();
    data.show(25, false);

    if (args[1].equals("json")) {
        // coalesce data into a single file
        data = data.coalesce(1);
    }
    data.write().mode("overwrite").format(args[1]).save(args[0]);

    long end = System.nanoTime();
    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}
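shifted3GramWord2VecEncode follows the ProtVec idea of shifted 3-grams: a sequence is split into three lists of non-overlapping 3-grams, each offset by one residue. The sketch below is an illustration only, not the library's implementation, and the exact splitting is an assumption (java.util imports omitted):

    String seq = "SRMPSPPMPVPPAALFNR"; // hypothetical sequence
    for (int shift = 0; shift < 3; shift++) {
        List<String> ngrams = new ArrayList<>();
        // collect non-overlapping 3-grams starting at the given offset
        for (int i = shift; i + 3 <= seq.length(); i += 3) {
            ngrams.add(seq.substring(i, i + 3));
        }
        System.out.println("shift " + shift + ": " + ngrams);
    }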
Use of edu.sdsc.mmtf.spark.ml.ProteinSequenceEncoder in project mmtf-spark by sbl-sdsc:
the class SecondaryStructureWord2VecModelEncoder, method main.
/**
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet), args[2] word2VecModelFile
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfFullPath();

    if (args.length != 3) {
        System.err.println("Usage: " + SecondaryStructureWord2VecModelEncoder.class.getSimpleName() + " <outputFilePath> <fileFormat> <word2VecModelFile>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(SecondaryStructureWord2VecModelEncoder.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant Pisces
    // subset (<= 20% sequence identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
            .readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));

    // extract sequence segments with secondary structure labels
    int segmentLength = 11;
    Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength);

    // add a Word2Vec-encoded feature vector using
    // a pre-trained Word2Vec model read from file
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    int n = 2; // n-gram size
    String modelFileName = args[2];
    data = encoder.overlappingNgramWord2VecEncode(modelFileName, n).cache();

    data.printSchema();
    data.show(25, false);

    if (args[1].equals("json")) {
        // coalesce data into a single file
        data = data.coalesce(1);
    }
    data.write().mode("overwrite").format(args[1]).save(args[0]);

    long end = System.nanoTime();
    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}
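A model file for args[2] can be produced by the PdbSequenceToWord2Vec example below. If the file is a standard Spark ML Word2VecModel (an assumption here), it can also be inspected directly (import org.apache.spark.ml.feature.Word2VecModel):

    // load and inspect a saved Word2Vec model (path is hypothetical)
    Word2VecModel model = Word2VecModel.load("/path/to/word2VecModel");
    model.getVectors().show(5, false); // word -> vector mapping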
Use of edu.sdsc.mmtf.spark.ml.ProteinSequenceEncoder in project mmtf-spark by sbl-sdsc:
the class PdbSequenceToWord2Vec, method main.
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 1) {
        System.err.println("Usage: " + PdbSequenceToWord2Vec.class.getSimpleName() + " <outputFileName>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(PdbSequenceToWord2Vec.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant Pisces
    // subset (<= 40% sequence identity) of L-protein chains
    int sequenceIdentity = 40;
    double resolution = 3.0;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
            .readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));

    Dataset<Row> data = PolymerSequenceExtractor.getDataset(pdb);
    data.show(10, false);

    // length of polymer sequence segment (number of residues)
    int segmentLength = 11;

    // train a Word2Vec model on n-grams of the polymer sequences
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    int n = 2;                                // n-gram size
    int windowSize = (segmentLength - 1) / 2; // context window size
    int vectorSize = 50;                      // dimension of the word vectors
    data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize);

    // save the trained Word2Vec model for later reuse
    encoder.getWord2VecModel().save(args[0]);

    long end = System.nanoTime();
    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}
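The model saved here is the kind of pre-trained model consumed by SecondaryStructureWord2VecModelEncoder above. A minimal sketch of the reuse, assuming the n-gram size matches the one used for training and reusing the pdb RDD from this example:

    // re-encode labeled segments with the saved model (file name from args[0] above)
    Dataset<Row> segments = SecondaryStructureSegmentExtractor.getDataset(pdb, 11);
    ProteinSequenceEncoder segmentEncoder = new ProteinSequenceEncoder(segments);
    Dataset<Row> encoded = segmentEncoder.overlappingNgramWord2VecEncode(args[0], 2);
    encoded.show(5, false);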