Search in sources:

Example 21 with StructureToPolymerChains

Use of edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains in project mmtf-spark by sbl-sdsc.

From class MmtfImporterTest, method test4.

@Test
public void test4() throws IOException {
    Path p = Paths.get("./src/main/resources/files/test");
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfImporter.importMmcifFiles(p.toString(), sc);
    assertTrue(pdb.count() == 1);
    pdb = pdb.flatMapToPair(new StructureToPolymerChains());
    assertEquals(8, pdb.count());
}
Also used : Path(java.nio.file.Path) StructureToPolymerChains(edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains) StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) Test(org.junit.Test)
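
These JUnit snippets assume a shared JavaSparkContext field named sc that is created before each test and closed afterwards. A minimal sketch of such a fixture, with illustrative names that are not taken from the project, could look like this:

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.junit.After;
import org.junit.Before;

private JavaSparkContext sc;

@Before
public void setUp() {
    // run Spark locally on all available cores; the app name is illustrative
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("MmtfImporterTest");
    sc = new JavaSparkContext(conf);
}

@After
public void tearDown() {
    sc.close();
}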

Example 22 with StructureToPolymerChains

Use of edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains in project mmtf-spark by sbl-sdsc.

From class MmtfReaderTest, method test2.

@Test
public void test2() throws IOException {
    Path p = Paths.get("./src/main/resources/files/test");
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readMmtfFiles(p.toString(), sc);
    assertTrue(pdb.count() == 1);
    pdb = pdb.flatMapToPair(new StructureToPolymerChains());
    assertTrue(pdb.count() == 8);
}
Also used : Path(java.nio.file.Path) StructureToPolymerChains(edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains) StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) Test(org.junit.Test)
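
After splitting an entry into chains, the same RDD can be narrowed further with the project's filter classes. A minimal sketch, assuming the ContainsLProteinChain filter listed under Aggregations below and reusing the pdb variable from the test above:

import edu.sdsc.mmtf.spark.filters.ContainsLProteinChain;

// keep only polymer chains that are L-protein chains
JavaPairRDD<String, StructureDataInterface> proteinChains = pdb.filter(new ContainsLProteinChain());
System.out.println("L-protein chains: " + proteinChains.count());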

Example 23 with StructureToPolymerChains

Use of edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains in project mmtf-spark by sbl-sdsc.

From class KinaseSearch, method main.

public static void main(String[] args) throws IOException {
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(KinaseSearch.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    // query for human protein-serine/threonine kinases using SIFTS data
    String sql = "SELECT t.pdbid, t.chain FROM sifts.pdb_chain_taxonomy AS t "
            + "JOIN sifts.pdb_chain_enzyme AS e ON (t.pdbid = e.pdbid AND t.chain = e.chain) "
            + "WHERE t.scientific_name = 'Homo sapiens' AND e.ec_number = '2.7.11.1'";
    // read PDB in MMTF format, split into polymer chains and search using
    // PdbJMineSearch
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readReducedSequenceFile(sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new PdbjMineSearch(sql));
    System.out.println("Number of entries matching query: " + pdb.count());
    sc.close();
}
Also used : PdbjMineSearch(edu.sdsc.mmtf.spark.webfilters.PdbjMineSearch) StructureToPolymerChains(edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) SparkConf(org.apache.spark.SparkConf)
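
When experimenting with such a query it can help to look at a few of the matching keys; after StructureToPolymerChains the keys have the form <pdbId>.<chainId>. The following addition is illustrative and not part of the original example:

// print the first few matching chain identifiers, e.g. "1ABC.A"
for (String chainId : pdb.keys().take(10)) {
    System.out.println(chainId);
}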

Example 24 with StructureToPolymerChains

Use of edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains in project mm-dev by sbl-sdsc.

From class ShapeTypeDemo, method main.

public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();
    if (args.length != 1) {
        System.err.println("Usage: " + ShapeTypeDemo.class.getSimpleName() + " <dataset output file");
        System.exit(1);
    }
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(ShapeTypeDemo.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    long start = System.nanoTime();
    // create a representative subset of polymer chains at 90% sequence identity
    // and a resolution of 2.5 A or better using the Pisces filter
    int sequenceIdentity = 90;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc)
            // extract polymer chains
            .flatMapToPair(new StructureToPolymerChains())
            // get representative subset
            .filter(new Pisces(sequenceIdentity, 2.5));
    // get a data set with sequence info
    Dataset<Row> seqData = PolymerSequenceExtractor.getDataset(pdb);
    // convert to BioJava data structure
    JavaPairRDD<String, Structure> structures = pdb.mapValues(new StructureToBioJava());
    // calculate shape data and convert to dataset
    JavaRDD<Row> rows = structures.map(t -> getShapeData(t));
    Dataset<Row> data = JavaRDDToDataset.getDataset(rows, "structureChainId", "shape");
    // there are only a few symmetric chains; leave them out
    data = data.filter("shape != 'EXCLUDE'");
    // join calculated data with the sequence data
    data = seqData.join(data, "structureChainId").cache();
    data.show(10);
    // create a Word2Vec representation of the protein sequences
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    // create 2-grams
    int n = 2;
    // 25-residue window size for Word2Vec
    int windowSize = 25;
    // dimension of feature vector
    int vectorSize = 50;
    data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize).cache();
    // save data in .parquet file
    data.write().mode("overwrite").format("parquet").save(args[0]);
    long end = System.nanoTime();
    System.out.println((end - start) / 1E9 + " sec.");
    sc.close();
}
Also used : ProteinSequenceEncoder(edu.sdsc.mmtf.spark.ml.ProteinSequenceEncoder) StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) Pisces(edu.sdsc.mmtf.spark.webfilters.Pisces) StructureToBioJava(edu.sdsc.mmtf.spark.mappers.StructureToBioJava) StructureToPolymerChains(edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Row(org.apache.spark.sql.Row) Structure(org.biojava.nbio.structure.Structure) SparkConf(org.apache.spark.SparkConf)
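
The getShapeData helper is not shown in this excerpt. Its result has to line up with the two column names passed to JavaRDDToDataset.getDataset, i.e. a Row holding the structureChainId and a shape label. A hypothetical stand-in with placeholder classification logic, not the project's actual method, might look like:

import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.biojava.nbio.structure.Structure;
import scala.Tuple2;

// hypothetical placeholder: a real implementation would derive the shape class
// from the chain's geometry; "OTHER" is an illustrative label, "EXCLUDE" matches
// the value filtered out above
private static Row getShapeData(Tuple2<String, Structure> t) {
    String structureChainId = t._1();
    Structure structure = t._2();
    String shape = (structure == null) ? "EXCLUDE" : "OTHER";
    return RowFactory.create(structureChainId, shape);
}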

Example 25 with StructureToPolymerChains

Use of edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains in project mm-dev by sbl-sdsc.

From class DemoAllVsAll, method main.

public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();
    long start = System.nanoTime();
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(DemoAllVsAll.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    // Read PDB and create a Pisces non-redundant set at 20% sequence identity and a resolution better than 1.6 A.
    // Then take a 1% random sample.
    double fraction = 0.01;
    // optional command line argument
    if (args.length == 1) {
        fraction = Double.parseDouble(args[0]);
    }
    long seed = 123;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(20, 1.6))
            .sample(false, fraction, seed);
    System.out.println(pdb.count());
    // run the structural alignment
    String algorithmName = FatCatRigid.algorithmName;
    Dataset<Row> alignments = StructureAligner.getAllVsAllAlignments(pdb, algorithmName).cache();
    // show results
    int count = (int) alignments.count();
    alignments.show(count);
    System.out.println("Pairs: " + count);
    long end = System.nanoTime();
    System.out.println("Time per alignment: " + TimeUnit.NANOSECONDS.toMillis((end - start) / count) + " msec.");
    System.out.println("Time: " + TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
    sc.close();
}
Also used : StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) Pisces(edu.sdsc.mmtf.spark.webfilters.Pisces) StructureToPolymerChains(edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Row(org.apache.spark.sql.Row) SparkConf(org.apache.spark.SparkConf)
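
If the pairwise results need to be kept rather than only printed, the Dataset can be written out in the same way as the other examples; the output path below is illustrative:

// persist the all-vs-all alignment results as a Parquet file
alignments.write().mode("overwrite").format("parquet").save("/tmp/all_vs_all_alignments.parquet");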

Aggregations

StructureToPolymerChains (edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains) 26
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext) 23
SparkConf (org.apache.spark.SparkConf) 22
StructureDataInterface (org.rcsb.mmtf.api.StructureDataInterface) 22
Row (org.apache.spark.sql.Row) 18
Pisces (edu.sdsc.mmtf.spark.webfilters.Pisces) 15
ProteinSequenceEncoder (edu.sdsc.mmtf.spark.ml.ProteinSequenceEncoder) 10
Path (java.nio.file.Path) 3
Test (org.junit.Test) 3
ContainsLProteinChain (edu.sdsc.mmtf.spark.filters.ContainsLProteinChain) 2
StructureToBioJava (edu.sdsc.mmtf.spark.mappers.StructureToBioJava) 2
PolymerComposition (edu.sdsc.mmtf.spark.filters.PolymerComposition) 1
PdbjMineSearch (edu.sdsc.mmtf.spark.webfilters.PdbjMineSearch) 1
SequenceSimilarity (edu.sdsc.mmtf.spark.webfilters.SequenceSimilarity) 1
JavaDoubleRDD (org.apache.spark.api.java.JavaDoubleRDD) 1
SparkSession (org.apache.spark.sql.SparkSession) 1
StructField (org.apache.spark.sql.types.StructField) 1
StructType (org.apache.spark.sql.types.StructType) 1
Structure (org.biojava.nbio.structure.Structure) 1