Search in sources :

Example 96 with StructureDataInterface

use of org.rcsb.mmtf.api.StructureDataInterface in project mm-dev by sbl-sdsc.

The main method of the class ImportPdbFiles.

/**
 * Converts a directory containing Rosetta-style PDB files into an MMTF-Hadoop Sequence file.
 * The input directory is traversed recursively to find PDB files.
 *
 * <p> Example files from Gremlin website:
 * https://gremlin2.bakerlab.org/meta/aah4043_final.zip
 *
 * @param args args[0] <path-to-pdb_files>, args[1] <path-to-mmtf-hadoop-file>
 *
 * @throws FileNotFoundException if the input directory cannot be read
 */
public static void main(String[] args) throws FileNotFoundException {
    // validate command-line arguments before starting Spark;
    // the original indexed args[0] unconditionally and threw
    // ArrayIndexOutOfBoundsException when launched without arguments
    if (args.length < 1) {
        System.err.println("Usage: ImportPdbFiles <path-to-pdb_files> [<path-to-mmtf-hadoop-file>]");
        System.exit(1);
    }
    // instantiate Spark
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("ImportPdbFiles");
    JavaSparkContext sc = new JavaSparkContext(conf);
    try {
        // read PDB files recursively starting at the specified directory
        JavaPairRDD<String, StructureDataInterface> structures = MmtfImporter.importPdbFiles(args[0], sc);
        structures.foreach(t -> System.out.println(t._1));
        System.out.println("Number of structures read: " + structures.count());
        // structures.foreach(t -> TraverseStructureHierarchy.demo(t._2));
        // save as an MMTF-Hadoop Sequence File
        // MmtfWriter.writeSequenceFile(args[1], sc, structures);
    } finally {
        // ensure Spark is shut down even if the import fails
        sc.close();
    }
}
Also used : JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) SparkConf(org.apache.spark.SparkConf)

Example 97 with StructureDataInterface

use of org.rcsb.mmtf.api.StructureDataInterface in project mm-dev by sbl-sdsc.

The main method of the class ShapeTypeDemo.

/**
 * Computes a shape classification for representative protein chains, joins it
 * with a Word2Vec encoding of the chain sequences, and writes the combined
 * dataset as a Parquet file.
 *
 * @param args args[0] path to the dataset output file
 * @throws IOException if the MMTF input files cannot be read
 */
public static void main(String[] args) throws IOException {
    if (args.length != 1) {
        // fixed: usage message was missing the closing '>' on the argument name
        System.err.println("Usage: " + ShapeTypeDemo.class.getSimpleName() + " <dataset output file>");
        System.exit(1);
    }
    // resolve the input path only after the arguments have been validated
    String path = MmtfReader.getMmtfReducedPath();
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(ShapeTypeDemo.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    long start = System.nanoTime();
    // create a non-redundant representative subset at 90% sequence identity
    // (the original comment claimed 40% Blast Clusters, contradicting this value)
    int sequenceIdentity = 90;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc).flatMapToPair(// extract polymer chains
    new StructureToPolymerChains()).filter(// get representative subset
    new Pisces(sequenceIdentity, 2.5));
    // get a data set with sequence info
    Dataset<Row> seqData = PolymerSequenceExtractor.getDataset(pdb);
    // convert to BioJava data structure
    JavaPairRDD<String, Structure> structures = pdb.mapValues(new StructureToBioJava());
    // calculate shape data and convert to dataset
    JavaRDD<Row> rows = structures.map(t -> getShapeData(t));
    Dataset<Row> data = JavaRDDToDataset.getDataset(rows, "structureChainId", "shape");
    // there are only few symmetric chains, leave them out
    data = data.filter("shape != 'EXCLUDE'");
    // join calculated data with the sequence data
    data = seqData.join(data, "structureChainId").cache();
    data.show(10);
    // create a Word2Vector representation of the protein sequences
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    // create 2-grams
    int n = 2;
    // 25-amino residue window size for Word2Vector
    int windowSize = 25;
    // dimension of feature vector
    int vectorSize = 50;
    data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize).cache();
    // save data in .parquet file
    data.write().mode("overwrite").format("parquet").save(args[0]);
    long end = System.nanoTime();
    System.out.println((end - start) / 1E9 + " sec.");
    sc.close();
}
Also used : ProteinSequenceEncoder(edu.sdsc.mmtf.spark.ml.ProteinSequenceEncoder) StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) Pisces(edu.sdsc.mmtf.spark.webfilters.Pisces) StructureToBioJava(edu.sdsc.mmtf.spark.mappers.StructureToBioJava) StructureToPolymerChains(edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Row(org.apache.spark.sql.Row) Structure(org.biojava.nbio.structure.Structure) SparkConf(org.apache.spark.SparkConf)

Example 98 with StructureDataInterface

use of org.rcsb.mmtf.api.StructureDataInterface in project mm-dev by sbl-sdsc.

The test method of the class ReducedEncoderNewTest.

/**
 * Verifies atom and bond counts produced by the MMTF reduced encoder
 * ({@code ReducedEncoder.getReduced}) for four downloaded PDB entries:
 * 1STP, 4HHB, 2ONX, and 2CCV. Requires network access to fetch the
 * full MMTF files.
 */
@Test
public void test() {
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(FilterByRFree.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    // List<String> pdbIds = Arrays.asList("1STP","4HHB","2ONX","1JLP","5X6H","5L2G","2MK1");
    List<String> pdbIds = Arrays.asList("1STP", "4HHB", "2ONX", "2CCV");
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc).cache();
    // pdb.foreach(t -> System.out.println(t._1 + "o :" + t._2.getNumBonds()));
    // List<String> chainIds = pdb.map(t -> t._1 + "_chainId_" + Arrays.toString(t._2.getChainIds())).collect();
    // System.out.println("full: " + chainIds);
    // List<String> chainNames = pdb.map(t -> t._1 + "_chainNames_" + Arrays.toString(t._2.getChainNames())).collect();
    // System.out.println("full: " + chainNames);
    // List<String> numGroups = pdb.map(t -> t._1 + "_numGroups_" + t._2.getNumGroups()).collect();
    // System.out.println("full: " + numGroups);
    // List<String> altlocs = pdb.map(t -> t._1 + "_altLocs_" + Arrays.toString(t._2.getAltLocIds())).collect();
    // System.out.println("full: " + altlocs);
    // apply the reduced encoding (C-alpha trace for polymers, full non-polymer groups)
    pdb = pdb.mapValues(v -> ReducedEncoder.getReduced(v)).cache();
    // chainIds = pdb.map(t -> t._1 + "_chainId_" + Arrays.toString(t._2.getChainIds())).collect();
    // System.out.println("reduced: " + chainIds);
    // chainNames = pdb.map(t -> t._1 + "_chainNames_" + Arrays.toString(t._2.getChainNames())).collect();
    // System.out.println("reduced: " + chainNames);
    // altlocs = pdb.map(t -> t._1 + "_altLocs_" + Arrays.toString(t._2.getAltLocIds())).collect();
    // System.out.println("reduced: " + altlocs);
    // Expected group counts after reduction (entry id "2CVV" in the original
    // comments appears to be a typo for 2CCV, the id actually tested below):
    // 1STP # groups 121 CA + 1 BTN = 122
    // 4HHB # groups 141x2 + 146x2 CA +  4 HEM + 2P (from PO4) = 580
    // 2ONX # groups 4 CA = 4
    // 2CCV # groups 99 CA + 4 altloc CA + 1 A2G (sugar) + 1 NAG (orig 15) + 1 GOL + 1 ZN, 1 ACE = 108
    // TODO (4 altlocs missing?)
    // numGroups = pdb.map(t -> t._1 + "_numGroups_" + t._2.getNumGroups()).collect();
    // System.out.println("reduced: " + numGroups);
    List<String> atoms = pdb.map(t -> t._1 + "_atoms_" + t._2.getNumAtoms()).collect();
    // System.out.println(atoms);
    // Expected atom counts after reduction:
    // 1STP # atoms 121 CA + 16 BTN
    // 4HHB # atom 141x2 + 146x2 CA +  43x4 HEM + 2P (from PO4) = 748
    // 2ONX # atoms 4 CA
    // 2CCV # atoms 99 CA + 4 (5?) altloc CA + 15 A2G (sugar) + 14 NAG (orig 15) + 6 GOL + 1 ZN, ACE 4 = 143
    assertTrue(atoms.contains("1STP_atoms_137"));
    assertTrue(atoms.contains("4HHB_atoms_748"));
    assertTrue(atoms.contains("2ONX_atoms_4"));
    assertTrue(atoms.contains("2CCV_atoms_143"));
    List<String> bonds = pdb.map(t -> t._1 + "_bonds_" + t._2.getNumBonds()).collect();
    // Expected bond counts after reduction:
    // 1STP # bond 17 BTN
    // 4HHB # bonds 50 x 4 HEM = 200
    // 2ONX # bonds 0
    // 2CCV # bonds 15 A2G+ 14 NAG (-O) + 5 GOL + 3 ACE + 2 disulfide bridges + 1 covalent bond to NAG = 40
    assertTrue(bonds.contains("1STP_bonds_17"));
    assertTrue(bonds.contains("4HHB_bonds_200"));
    assertTrue(bonds.contains("2ONX_bonds_0"));
    assertTrue(bonds.contains("2CCV_bonds_40"));
    sc.close();
}
Also used : Arrays(java.util.Arrays) List(java.util.List) ReducedEncoder(org.rcsb.mmtf.encoder.ReducedEncoder) StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) FilterByRFree(edu.sdsc.mmtf.spark.filters.demos.FilterByRFree) SparkConf(org.apache.spark.SparkConf) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Assert.assertTrue(org.junit.Assert.assertTrue) Test(org.junit.Test) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) MmtfReader(edu.sdsc.mmtf.spark.io.MmtfReader) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) FilterByRFree(edu.sdsc.mmtf.spark.filters.demos.FilterByRFree) SparkConf(org.apache.spark.SparkConf) Test(org.junit.Test)

Example 99 with StructureDataInterface

use of org.rcsb.mmtf.api.StructureDataInterface in project mm-dev by sbl-sdsc.

The main method of the class DemoAllVsAll.

/**
 * Runs all-vs-all FatCat rigid structural alignments over a small random
 * sample of a Pisces non-redundant chain set (20% sequence identity,
 * resolution better than 1.6 A) and prints the alignment results.
 *
 * @param args optional args[0] sample fraction (default 0.01)
 * @throws IOException if the MMTF input files cannot be read
 */
public static void main(String[] args) throws IOException {
    String inputPath = MmtfReader.getMmtfReducedPath();
    long startTime = System.nanoTime();
    SparkConf sparkConf = new SparkConf().setMaster("local[*]").setAppName(DemoAllVsAll.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    // Read PDB and create a Pisces non-redundant set at 20% sequence identity and a resolution better than 1.6 A.
    // Then take a 1% random sample (or the fraction given on the command line).
    double sampleFraction = (args.length == 1) ? Double.parseDouble(args[0]) : 0.01;
    long samplingSeed = 123;
    JavaPairRDD<String, StructureDataInterface> chains = MmtfReader.readSequenceFile(inputPath, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(20, 1.6))
            .sample(false, sampleFraction, samplingSeed);
    System.out.println(chains.count());
    // run the structural alignment with the FatCat rigid algorithm
    Dataset<Row> alignments = StructureAligner.getAllVsAllAlignments(chains, FatCatRigid.algorithmName).cache();
    // show results
    int pairCount = (int) alignments.count();
    alignments.show(pairCount);
    System.out.println("Pairs: " + pairCount);
    long endTime = System.nanoTime();
    System.out.println("Time per alignment: " + TimeUnit.NANOSECONDS.toMillis((endTime - startTime) / pairCount) + " msec.");
    System.out.println("Time: " + TimeUnit.NANOSECONDS.toSeconds(endTime - startTime) + " sec.");
    sc.close();
}
Also used : StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) Pisces(edu.sdsc.mmtf.spark.webfilters.Pisces) StructureToPolymerChains(edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Row(org.apache.spark.sql.Row) SparkConf(org.apache.spark.SparkConf)

Example 100 with StructureDataInterface

use of org.rcsb.mmtf.api.StructureDataInterface in project mm-dev by sbl-sdsc.

The main method of the class Driver1.

/**
 * Aligns a query chain (1STP) against a target chain (4OKA) with the
 * "exhaustive" alignment algorithm and writes the results as a single
 * CSV file, then prints them sorted by TM-score.
 *
 * @param args args[0] output path for the CSV results
 * @throws IOException if the MMTF input files cannot be read
 */
public static void main(String[] args) throws IOException {
    // validate command-line arguments before starting Spark;
    // the original indexed args[0] unconditionally and threw
    // ArrayIndexOutOfBoundsException when launched without arguments
    if (args.length < 1) {
        System.err.println("Usage: " + Driver1.class.getSimpleName() + " <csv-output-path>");
        System.exit(1);
    }
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(Driver1.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    long start = System.nanoTime();
    // download query structure
    // List<String> queryId = Arrays.asList("2O9U");
    List<String> queryId = Arrays.asList("1STP");
    JavaPairRDD<String, StructureDataInterface> query = MmtfReader.downloadReducedMmtfFiles(queryId, sc).flatMapToPair(new StructureToPolymerChains(false, true));
    // Examples similar: 4N6T, 2CH9, 3UL5, 3KVP
    // Examples dissimilar: 5O5I, 1STP,
    // List<String> targetId = Arrays.asList("4N6T", "2CH9", "3UL5", "3KVP", "1STP", "5O5I");
    List<String> targetId = Arrays.asList("4OKA");
    JavaPairRDD<String, StructureDataInterface> target = MmtfReader.downloadReducedMmtfFiles(targetId, sc).flatMapToPair(new StructureToPolymerChains(false, true));
    // two standard algorithms
    // String alignmentAlgorithm = CeMain.algorithmName;
    // String alignmentAlgorithm = FatCatRigid.algorithmName;
    String alignmentAlgorithm = "exhaustive";
    // calculate alignments
    Dataset<Row> alignments = StructureAligner.getQueryVsAllAlignments(query, target, alignmentAlgorithm).cache();
    alignments.coalesce(1).write().mode("overwrite").format("csv").save(args[0]);
    // show results
    int count = (int) alignments.count();
    alignments.sort(col("tm").desc()).show(count);
    System.out.println("Pairs: " + count);
    long end = System.nanoTime();
    if (count > 0) {
        // guard against division by zero when no alignments were produced
        System.out.println("Time per alignment: " + TimeUnit.NANOSECONDS.toMillis((end - start) / count) + " msec.");
    }
    System.out.println("Time: " + TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
    sc.close();
}
Also used : StructureToPolymerChains(edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) Row(org.apache.spark.sql.Row) SparkConf(org.apache.spark.SparkConf)

Aggregations

StructureDataInterface (org.rcsb.mmtf.api.StructureDataInterface)102 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)60 SparkConf (org.apache.spark.SparkConf)58 Row (org.apache.spark.sql.Row)27 StructureToPolymerChains (edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains)22 Test (org.junit.Test)20 Pisces (edu.sdsc.mmtf.spark.webfilters.Pisces)19 ArrayList (java.util.ArrayList)12 ProteinSequenceEncoder (edu.sdsc.mmtf.spark.ml.ProteinSequenceEncoder)10 ColumnarStructure (edu.sdsc.mmtf.spark.utils.ColumnarStructure)10 Tuple2 (scala.Tuple2)9 Path (java.nio.file.Path)7 HashSet (java.util.HashSet)7 AdapterToStructureData (org.rcsb.mmtf.encoder.AdapterToStructureData)7 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)6 ContainsLProteinChain (edu.sdsc.mmtf.spark.filters.ContainsLProteinChain)5 List (java.util.List)5 Resolution (edu.sdsc.mmtf.spark.filters.Resolution)4 MmtfReader (edu.sdsc.mmtf.spark.io.MmtfReader)4 File (java.io.File)4