Search in sources :

Example 61 with StructureDataInterface

use of org.rcsb.mmtf.api.StructureDataInterface in project mmtf-spark by sbl-sdsc.

the class SecondaryStructureOneHotEncoder method main.

/**
 * Creates a dataset of secondary structure segments with a one-hot encoded
 * sequence feature vector and writes it to the given output location.
 *
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet)
 * @throws IOException declared by the signature; presumably raised while
 *                     locating or reading the MMTF input (confirm against MmtfReader)
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();
    if (args.length < 2) {
        System.err.println("Usage: " + SecondaryStructureOneHotEncoder.class.getSimpleName() + " <outputFilePath> + <fileFormat> + [<modelFileName>]");
        System.exit(1);
    }
    long start = System.nanoTime();
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(SecondaryStructureOneHotEncoder.class.getSimpleName());
    // try-with-resources guarantees the Spark context is shut down, both on
    // normal completion and when any step below throws
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // read MMTF Hadoop sequence file and create a non-redundant Pisces
        // subset (<=20% seq. identity) of L-protein chains
        int sequenceIdentity = 20;
        double resolution = 3.0;
        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc).flatMapToPair(new StructureToPolymerChains()).filter(new Pisces(sequenceIdentity, resolution));
        // extract fixed-length sequence segments
        int segmentLength = 11;
        Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength).cache();
        System.out.println("original data     : " + data.count());
        // remove rows that share both the Q3 label and the sequence
        data = data.dropDuplicates("labelQ3", "sequence").cache();
        System.out.println("- duplicate Q3/seq: " + data.count());
        // remove remaining rows that share the same sequence
        data = data.dropDuplicates("sequence").cache();
        System.out.println("- duplicate seq   : " + data.count());
        // add one-hot encoded sequence feature vector to dataset
        ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
        data = encoder.oneHotEncode();
        data.printSchema();
        data.show(25, false);
        if (args[1].equals("json")) {
            // coalesce data into a single file
            data = data.coalesce(1);
        }
        data.write().mode("overwrite").format(args[1]).save(args[0]);
        // report elapsed time before the context is closed, as the original did
        long end = System.nanoTime();
        System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
    }
}
Also used : ProteinSequenceEncoder(edu.sdsc.mmtf.spark.ml.ProteinSequenceEncoder) StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) Pisces(edu.sdsc.mmtf.spark.webfilters.Pisces) StructureToPolymerChains(edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Row(org.apache.spark.sql.Row) SparkConf(org.apache.spark.SparkConf)

Example 62 with StructureDataInterface

use of org.rcsb.mmtf.api.StructureDataInterface in project mmtf-spark by sbl-sdsc.

the class SecondaryStructurePropertyEncoder method main.

/**
 * Creates a dataset of secondary structure segments with a property encoded
 * sequence feature vector and writes it to the given output location.
 *
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet)
 * @throws IOException declared by the signature; presumably raised while
 *                     locating or reading the MMTF input (confirm against MmtfReader)
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();
    if (args.length != 2) {
        System.err.println("Usage: " + SecondaryStructurePropertyEncoder.class.getSimpleName() + " <outputFilePath> + <fileFormat>");
        System.exit(1);
    }
    long start = System.nanoTime();
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(SecondaryStructurePropertyEncoder.class.getSimpleName());
    // try-with-resources guarantees the Spark context is shut down, both on
    // normal completion and when any step below throws
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // read MMTF Hadoop sequence file and create a non-redundant Pisces
        // subset (<=20% seq. identity) of L-protein chains
        int sequenceIdentity = 20;
        double resolution = 3.0;
        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc).flatMapToPair(new StructureToPolymerChains()).filter(new Pisces(sequenceIdentity, resolution));
        // extract fixed-length sequence segments
        int segmentLength = 11;
        Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength).cache();
        System.out.println("original data     : " + data.count());
        // remove rows that share both the Q3 label and the sequence
        data = data.dropDuplicates("labelQ3", "sequence").cache();
        System.out.println("- duplicate Q3/seq: " + data.count());
        // remove remaining rows that share the same sequence
        data = data.dropDuplicates("sequence").cache();
        System.out.println("- duplicate seq   : " + data.count());
        // add a property encoded feature vector
        ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
        data = encoder.propertyEncode();
        data.printSchema();
        data.show(25, false);
        if (args[1].equals("json")) {
            // coalesce data into a single file
            data = data.coalesce(1);
        }
        data.write().mode("overwrite").format(args[1]).save(args[0]);
        // report elapsed time before the context is closed, as the original did
        long end = System.nanoTime();
        System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
    }
}
Also used : ProteinSequenceEncoder(edu.sdsc.mmtf.spark.ml.ProteinSequenceEncoder) StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) Pisces(edu.sdsc.mmtf.spark.webfilters.Pisces) StructureToPolymerChains(edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Row(org.apache.spark.sql.Row) SparkConf(org.apache.spark.SparkConf)

Example 63 with StructureDataInterface

use of org.rcsb.mmtf.api.StructureDataInterface in project mmtf-spark by sbl-sdsc.

the class SecondaryStructureWord2VecEncoder method main.

/**
 * Creates a dataset of secondary structure segments with a Word2Vec encoded
 * feature vector built from overlapping n-grams and writes it to the given
 * output location.
 *
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet)
 * @throws IOException declared by the signature; presumably raised while
 *                     locating or reading the MMTF input (confirm against MmtfReader)
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();
    if (args.length != 2) {
        System.err.println("Usage: " + SecondaryStructureWord2VecEncoder.class.getSimpleName() + " <outputFilePath> + <fileFormat>");
        System.exit(1);
    }
    long start = System.nanoTime();
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(SecondaryStructureWord2VecEncoder.class.getSimpleName());
    // try-with-resources guarantees the Spark context is shut down, both on
    // normal completion and when any step below throws
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // read MMTF Hadoop sequence file and create a non-redundant Pisces
        // subset (<=20% seq. identity) of L-protein chains
        int sequenceIdentity = 20;
        double resolution = 3.0;
        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc).flatMapToPair(new StructureToPolymerChains()).filter(new Pisces(sequenceIdentity, resolution));
        // extract fixed-length sequence segments
        int segmentLength = 11;
        Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength);
        // add Word2Vec encoded feature vector
        ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
        int n = 2;
        // window size is derived from the segment length: (11 - 1) / 2 = 5
        int windowSize = (segmentLength - 1) / 2;
        int vectorSize = 50;
        data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize);
        data.printSchema();
        data.show(25, false);
        if (args[1].equals("json")) {
            // coalesce data into a single file
            data = data.coalesce(1);
        }
        data.write().mode("overwrite").format(args[1]).save(args[0]);
        // report elapsed time before the context is closed, as the original did
        long end = System.nanoTime();
        System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
    }
}
Also used : ProteinSequenceEncoder(edu.sdsc.mmtf.spark.ml.ProteinSequenceEncoder) StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) Pisces(edu.sdsc.mmtf.spark.webfilters.Pisces) StructureToPolymerChains(edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Row(org.apache.spark.sql.Row) SparkConf(org.apache.spark.SparkConf)

Example 64 with StructureDataInterface

use of org.rcsb.mmtf.api.StructureDataInterface in project mmtf-spark by sbl-sdsc.

the class FilterByResolution method main.

/**
 * Counts the PDB entries whose resolution passes the {@code Resolution(0.0, 2.0)}
 * filter and prints the count together with the elapsed wall-clock time.
 *
 * @param args not used
 * @throws FileNotFoundException declared by the signature; presumably raised
 *                               when the MMTF input cannot be located (confirm against MmtfReader)
 */
public static void main(String[] args) throws FileNotFoundException {
    final String mmtfPath = MmtfReader.getMmtfReducedPath();
    final long startNanos = System.nanoTime();

    // every Spark application needs a configuration and a context
    final String appName = FilterByResolution.class.getSimpleName();
    SparkConf sparkConf = new SparkConf().setMaster("local[*]").setAppName(appName);
    JavaSparkContext context = new JavaSparkContext(sparkConf);

    // read the entire PDB in MMTF format and keep only entries accepted by
    // the resolution filter; per the original note, entries without a
    // resolution value (e.g., NMR structures) are filtered out as well
    JavaPairRDD<String, StructureDataInterface> filtered =
            MmtfReader.readSequenceFile(mmtfPath, context).filter(new Resolution(0.0, 2.0));
    long structureCount = filtered.count();
    System.out.println("# structures: " + structureCount);

    // shut down Spark before taking the end timestamp
    context.close();

    double elapsedSeconds = (System.nanoTime() - startNanos) / 1E9;
    System.out.println(elapsedSeconds + " sec.");
}
Also used : JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) SparkConf(org.apache.spark.SparkConf) Resolution(edu.sdsc.mmtf.spark.filters.Resolution)

Example 65 with StructureDataInterface

use of org.rcsb.mmtf.api.StructureDataInterface in project mmtf-spark by sbl-sdsc.

the class FilterBySequenceRegex method main.

/**
 * Counts the PDB entries whose sequence matches the zinc finger motif
 * regular expression and prints the count together with the elapsed time.
 *
 * @param args not used
 * @throws FileNotFoundException declared by the signature; presumably raised
 *                               when the MMTF input cannot be located (confirm against MmtfReader)
 */
public static void main(String[] args) throws FileNotFoundException {
    final String mmtfPath = MmtfReader.getMmtfReducedPath();
    final long startNanos = System.nanoTime();

    final String appName = FilterBySequenceRegex.class.getSimpleName();
    SparkConf sparkConf = new SparkConf().setMaster("local[*]").setAppName(appName);
    JavaSparkContext context = new JavaSparkContext(sparkConf);

    // read the PDB in MMTF format and keep the structures that contain
    // a zinc finger motif
    JavaPairRDD<String, StructureDataInterface> zincFingers =
            MmtfReader.readSequenceFile(mmtfPath, context).filter(new ContainsSequenceRegex("C.{2,4}C.{12}H.{3,5}H"));
    long matchCount = zincFingers.count();
    System.out.println("Number of PDB entries containing a Zinc finger motif: " + matchCount);

    // elapsed time is reported before the context is shut down
    double elapsedSeconds = (System.nanoTime() - startNanos) / 1E9;
    System.out.println("Time: " + elapsedSeconds + " sec.");

    context.close();
}
Also used : JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) SparkConf(org.apache.spark.SparkConf) ContainsSequenceRegex(edu.sdsc.mmtf.spark.filters.ContainsSequenceRegex)

Aggregations

StructureDataInterface (org.rcsb.mmtf.api.StructureDataInterface)102 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)60 SparkConf (org.apache.spark.SparkConf)58 Row (org.apache.spark.sql.Row)27 StructureToPolymerChains (edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains)22 Test (org.junit.Test)20 Pisces (edu.sdsc.mmtf.spark.webfilters.Pisces)19 ArrayList (java.util.ArrayList)12 ProteinSequenceEncoder (edu.sdsc.mmtf.spark.ml.ProteinSequenceEncoder)10 ColumnarStructure (edu.sdsc.mmtf.spark.utils.ColumnarStructure)10 Tuple2 (scala.Tuple2)9 Path (java.nio.file.Path)7 HashSet (java.util.HashSet)7 AdapterToStructureData (org.rcsb.mmtf.encoder.AdapterToStructureData)7 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)6 ContainsLProteinChain (edu.sdsc.mmtf.spark.filters.ContainsLProteinChain)5 List (java.util.List)5 Resolution (edu.sdsc.mmtf.spark.filters.Resolution)4 MmtfReader (edu.sdsc.mmtf.spark.io.MmtfReader)4 File (java.io.File)4