
Example 6 with StructureToPolymerChains

use of edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains in project mmtf-spark by sbl-sdsc.

the class SequenceSimilarityDemo method main.

/**
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(SequenceSimilarityDemo.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    String sequence = "NLVQFGVMIEKMTGKSALQYNDYGCYCGIGGSHWPVDQ";
    double eValueCutoff = 0.001;
    int sequenceIdentityCutoff = 40;
    boolean maskLowComplexity = true;
    // read PDB in MMTF format, split into polymer chains,
    // search by sequence similarity, and print sequences found
    MmtfReader.readReducedSequenceFile(sc)
            .flatMapToPair(new StructureToPolymerChains(false, true))
            .filter(new SequenceSimilarity(sequence, SequenceSimilarity.BLAST, eValueCutoff, sequenceIdentityCutoff, maskLowComplexity))
            .foreach(t -> System.out.println(t._1 + ": " + t._2.getEntitySequence(0)));
    sc.close();
}
Also used : SequenceSimilarity(edu.sdsc.mmtf.spark.webfilters.SequenceSimilarity) StructureToPolymerChains(edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) SparkConf(org.apache.spark.SparkConf)
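The same pipeline can also return its results to the driver instead of printing them on the executors. The snippet below is a hypothetical variation, not part of the mmtf-spark examples: it reuses the sequence, cutoff, and masking variables from above and collects the matching chain IDs and sequences into a local map.

    // hypothetical variation: collect matching chains to the driver (requires java.util.Map)
    Map<String, String> hits = MmtfReader.readReducedSequenceFile(sc)
            .flatMapToPair(new StructureToPolymerChains(false, true))
            .filter(new SequenceSimilarity(sequence, SequenceSimilarity.BLAST, eValueCutoff, sequenceIdentityCutoff, maskLowComplexity))
            .mapValues(structure -> structure.getEntitySequence(0))
            .collectAsMap();
    // print the hits locally, e.g. for logging or further processing
    hits.forEach((chainId, seq) -> System.out.println(chainId + ": " + seq));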

Example 7 with StructureToPolymerChains

use of edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains in project mmtf-spark by sbl-sdsc.

the class SecondaryStructureWord2VecModelEncoder method main.

/**
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet), args[2] word2VecModelFile
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfFullPath();
    if (args.length != 3) {
        System.err.println("Usage: " + SecondaryStructureWord2VecModelEncoder.class.getSimpleName() + " <outputFilePath> + <fileFormat> + <word2VecModelFile>");
        System.exit(1);
    }
    long start = System.nanoTime();
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(SecondaryStructureWord2VecModelEncoder.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    // read MMTF Hadoop sequence file and create a non-redundant Pisces
    // subset (<= 20% sequence identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));
    // get content
    int segmentLength = 11;
    Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength);
    // add Word2Vec encoded feature vector using
    // a pre-trained Word2Vec model read from file
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    int n = 2;
    String modelFileName = args[2];
    data = encoder.overlappingNgramWord2VecEncode(modelFileName, n).cache();
    data.printSchema();
    data.show(25, false);
    if (args[1].equals("json")) {
        // coalesce data into a single file
        data = data.coalesce(1);
    }
    data.write().mode("overwrite").format(args[1]).save(args[0]);
    long end = System.nanoTime();
    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}
Also used : ProteinSequenceEncoder(edu.sdsc.mmtf.spark.ml.ProteinSequenceEncoder) StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) Pisces(edu.sdsc.mmtf.spark.webfilters.Pisces) StructureToPolymerChains(edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Row(org.apache.spark.sql.Row) SparkConf(org.apache.spark.SparkConf)
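Once the encoded dataset has been written, it can be read back for inspection with Spark SQL. This is a minimal sketch, assuming the same format argument (json or parquet) and output path were used as above; the path and application name are illustrative placeholders.

    // minimal sketch: read the saved dataset back for inspection (path and format are placeholders)
    SparkSession spark = SparkSession.builder().master("local[*]").appName("ReadEncodedSegments").getOrCreate();
    Dataset<Row> encoded = spark.read().format("parquet").load("/path/to/output");  // or format("json")
    encoded.printSchema();
    encoded.show(10, false);
    spark.stop();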

Example 8 with StructureToPolymerChains

use of edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains in project mm-dev by sbl-sdsc.

the class DemoAllVsAll_cluster method main.

public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();
    long start = System.nanoTime();
    SparkConf conf = new SparkConf();
    JavaSparkContext sc = new JavaSparkContext(conf);
    // Read PDB and create a Pisces non-redundant set at 20% sequence identity and a resolution better than 1.6 A.
    // Then take a 1% random sample.
    double fraction = 0.01;
    // optional command line argument
    if (args.length == 1) {
        fraction = Double.parseDouble(args[0]);
    }
    long seed = 123;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(20, 1.6))
            .sample(false, fraction, seed);
    System.out.println(pdb.count());
    // run the structural alignment
    String algorithmName = FatCatRigid.algorithmName;
    Dataset<Row> alignments = StructureAligner.getAllVsAllAlignments(pdb, algorithmName).cache();
    // show results
    int count = (int) alignments.count();
    alignments.show(count);
    System.out.println("Pairs: " + count);
    long end = System.nanoTime();
    System.out.println("Time per alignment: " + TimeUnit.NANOSECONDS.toMillis((end - start) / count) + " msec.");
    System.out.println("Time: " + TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
    sc.close();
}
Also used : Pisces(edu.sdsc.mmtf.spark.webfilters.Pisces) StructureToPolymerChains(edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) Row(org.apache.spark.sql.Row) SparkConf(org.apache.spark.SparkConf)
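For larger samples, showing every pair on the console is impractical; the alignment Dataset can instead be filtered and persisted. The sketch below assumes the result contains a tm (TM-score) column, as used in the next example; the 0.5 threshold and output path are illustrative, and col requires a static import of org.apache.spark.sql.functions.col.

    // illustrative post-processing: keep high-scoring pairs and save them (threshold and path are assumptions)
    Dataset<Row> significant = alignments.filter(col("tm").gt(0.5));
    significant.write().mode("overwrite").parquet("/path/to/allVsAllAlignments.parquet");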

Example 9 with StructureToPolymerChains

use of edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains in project mm-dev by sbl-sdsc.

the class DemoQueryVsAll method main.

public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(DemoQueryVsAll.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    long start = System.nanoTime();
    // download query structure
    List<String> queryId = Arrays.asList("2W47");
    JavaPairRDD<String, StructureDataInterface> query = MmtfReader.downloadFullMmtfFiles(queryId, sc).flatMapToPair(new StructureToPolymerChains());
    // read the PDB, filter by the Pisces non-redundant set
    // (20% sequence identity, resolution better than 1.6 A),
    // then take an 8% random sample
    double fraction = 1.0;
    long seed = 123;
    JavaPairRDD<String, StructureDataInterface> target = MmtfReader.readSequenceFile(path, fraction, seed, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(20, 1.6))
            .sample(false, 0.08, seed);
    // alternative alignment algorithms (uncomment one to use it instead):
    // String alignmentAlgorithm = CeMain.algorithmName;
    // String alignmentAlgorithm = CeCPMain.algorithmName;
    // String alignmentAlgorithm = FatCatFlexible.algorithmName;
    // String alignmentAlgorithm = ExhaustiveAligner.alignmentAlgorithm;
    String alignmentAlgorithm = FatCatRigid.algorithmName;
    // calculate alignments
    Dataset<Row> alignments = StructureAligner.getQueryVsAllAlignments(query, target, alignmentAlgorithm).cache();
    // show results
    int count = (int) alignments.count();
    alignments.sort(col("tm").desc()).show(count);
    System.out.println("Pairs: " + count);
    long end = System.nanoTime();
    System.out.println("Time per alignment: " + TimeUnit.NANOSECONDS.toMillis((end - start) / count) + " msec.");
    System.out.println("Time: " + TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
    sc.close();
}
Also used : StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) Pisces(edu.sdsc.mmtf.spark.webfilters.Pisces) StructureToPolymerChains(edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Row(org.apache.spark.sql.Row) SparkConf(org.apache.spark.SparkConf)
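Instead of printing every pair, one might keep only the best hits. The snippet below is a hypothetical follow-up that limits the TM-score-sorted result to the top 10 alignments; it reuses the alignments Dataset and the col helper from the example above.

    // hypothetical follow-up: report only the 10 best-scoring alignments
    Dataset<Row> topHits = alignments.sort(col("tm").desc()).limit(10);
    topHits.show(false);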

Example 10 with StructureToPolymerChains

use of edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains in project mm-dev by sbl-sdsc.

the class CathClassificationDataset method getSequenceData.

private static Dataset<Row> getSequenceData(String[] args) throws IOException {
    SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate();
    JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
    JavaRDD<Row> pdb = MmtfReader.readSequenceFile(args[0], sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(40, 2.0))
            .map(t -> RowFactory.create(t._1, t._2.getEntitySequence(0)));
    // Generate the schema
    StructType schema = new StructType(new StructField[] {
            new StructField("structureChainId", DataTypes.StringType, false, Metadata.empty()),
            new StructField("sequence", DataTypes.StringType, false, Metadata.empty())
    });
    // Apply the schema to the RDD
    return spark.createDataFrame(pdb, schema);
}
Also used : SparkSession(org.apache.spark.sql.SparkSession) StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) Pisces(edu.sdsc.mmtf.spark.webfilters.Pisces) StructureToPolymerChains(edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Row(org.apache.spark.sql.Row)
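A typical caller of getSequenceData would then attach features to the sequence column, for example with the ProteinSequenceEncoder used elsewhere in these examples. This is a minimal sketch, assuming a pre-trained Word2Vec model file; the model path and n-gram size are illustrative.

    // illustrative use of the returned dataset (model path and n-gram size are assumptions)
    Dataset<Row> data = getSequenceData(args);
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    data = encoder.overlappingNgramWord2VecEncode("/path/to/word2VecModel", 2).cache();
    data.printSchema();
    data.show(10, false);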

Aggregations

StructureToPolymerChains (edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains): 26
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 23
SparkConf (org.apache.spark.SparkConf): 22
StructureDataInterface (org.rcsb.mmtf.api.StructureDataInterface): 22
Row (org.apache.spark.sql.Row): 18
Pisces (edu.sdsc.mmtf.spark.webfilters.Pisces): 15
ProteinSequenceEncoder (edu.sdsc.mmtf.spark.ml.ProteinSequenceEncoder): 10
Path (java.nio.file.Path): 3
Test (org.junit.Test): 3
ContainsLProteinChain (edu.sdsc.mmtf.spark.filters.ContainsLProteinChain): 2
StructureToBioJava (edu.sdsc.mmtf.spark.mappers.StructureToBioJava): 2
PolymerComposition (edu.sdsc.mmtf.spark.filters.PolymerComposition): 1
PdbjMineSearch (edu.sdsc.mmtf.spark.webfilters.PdbjMineSearch): 1
SequenceSimilarity (edu.sdsc.mmtf.spark.webfilters.SequenceSimilarity): 1
JavaDoubleRDD (org.apache.spark.api.java.JavaDoubleRDD): 1
SparkSession (org.apache.spark.sql.SparkSession): 1
StructField (org.apache.spark.sql.types.StructField): 1
StructType (org.apache.spark.sql.types.StructType): 1
Structure (org.biojava.nbio.structure.Structure): 1