
Example 76 with StructureDataInterface

Use of org.rcsb.mmtf.api.StructureDataInterface in project mmtf-spark by sbl-sdsc.

From class ReadMmtfReduced, method main:

public static void main(String[] args) throws FileNotFoundException {
    String path = MmtfReader.getMmtfReducedPath();
    // instantiate Spark. Each Spark application needs these two lines of code.
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(ReadMmtfReduced.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    // read the listed PDB entries from a local MMTF Hadoop sequence file
    List<String> pdbIds = Arrays.asList("1AQ1", "1B38", "1B39", "1BUH");
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, pdbIds, sc);
    System.out.println("# structures: " + pdb.count());
    // close Spark
    sc.close();
}
Also used : JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), StructureDataInterface (org.rcsb.mmtf.api.StructureDataInterface), SparkConf (org.apache.spark.SparkConf)
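
A quick follow-up sketch, not part of the original example: once the pair RDD is loaded, per-entry metadata can be printed through the StructureDataInterface accessors. It assumes only getNumChains() and getNumAtoms() from the mmtf-api plus scala.Tuple2 for the pair type, and would go just before sc.close().

// hypothetical follow-up: print basic metadata for each loaded structure
for (Tuple2<String, StructureDataInterface> entry : pdb.collect()) {
    System.out.println(entry._1() + ": " + entry._2().getNumChains()
            + " chains, " + entry._2().getNumAtoms() + " atoms");
}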

Example 77 with StructureDataInterface

Use of org.rcsb.mmtf.api.StructureDataInterface in project mmtf-spark by sbl-sdsc.

From class DSSPDemo, method main:

public static void main(String[] args) throws IOException {
    long start = System.nanoTime();
    SparkConf conf = new SparkConf().setMaster("local[1]").setAppName(DSSPDemo.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    // single protein chain
    List<String> pdbIds = Arrays.asList("1STP");
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc).cache();
    // split each structure into its individual polymer chains
    pdb = pdb.flatMapToPair(new StructureToPolymerChains());
    Dataset<Row> ds = SecondaryStructureExtractor.getDataset(pdb);
    // show the schema and the first two rows of this dataset
    ds.printSchema();
    ds.show(2, false);
    long end = System.nanoTime();
    System.out.println("Time:     " + (end - start) / 1E9 + "sec.");
    sc.close();
}
Also used : StructureToPolymerChains (edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), StructureDataInterface (org.rcsb.mmtf.api.StructureDataInterface), Row (org.apache.spark.sql.Row), SparkConf (org.apache.spark.SparkConf)
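
A possible next step, shown here as a sketch rather than part of the original demo: assuming the extractor's schema includes fractional "alpha", "beta", and "coil" columns (as in mmtf-spark's SecondaryStructureExtractor), the dataset can be screened with a Spark SQL filter expression.

// hypothetical follow-up: keep chains that are predominantly helical
// ("alpha" is assumed to be a fraction between 0 and 1)
Dataset<Row> helical = ds.filter("alpha > 0.5");
helical.show(5, false);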

Example 78 with StructureDataInterface

Use of org.rcsb.mmtf.api.StructureDataInterface in project mmtf-spark by sbl-sdsc.

From class WaterInteractions, method main:

public static void main(String[] args) throws IOException, ParseException {
    String timeStamp = new SimpleDateFormat("yyyyMMdd_HHmm").format(Calendar.getInstance().getTime());
    long start = System.nanoTime();
    // process command line options (defaults are provided)
    CommandLine cmd = getCommandLine(args);
    String outputPath = cmd.getOptionValue("output-path");
    System.out.println("Output path: " + outputPath);
    String resolution = cmd.getOptionValue("resolution", "2");
    String minInteractions = cmd.getOptionValue("min-interactions", "2");
    String maxInteractions = cmd.getOptionValue("max-interactions", "4");
    String distanceCutoff = cmd.getOptionValue("distance-cutoff", "3");
    String bFactorCutoff = cmd.getOptionValue("b-factor-cutoff", "1.645");
    boolean includeWaters = cmd.hasOption("include-waters");
    // get path to MMTF Hadoop Sequence file
    String path = MmtfReader.getMmtfFullPath();
    // initialize Spark
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(WaterInteractions.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    // read PDB structures and filter by resolution and only include proteins
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc)
            .filter(new Resolution(0.0, Float.parseFloat(resolution)))
            .filter(new ContainsLProteinChain(true));
    // setup interaction criteria
    InteractionFilter filter = new InteractionFilter();
    filter.setDistanceCutoff(Float.parseFloat(distanceCutoff));
    filter.setNormalizedbFactorCutoff(Float.parseFloat(bFactorCutoff));
    filter.setMinInteractions(Integer.parseInt(minInteractions));
    filter.setMaxInteractions(Integer.parseInt(maxInteractions));
    filter.setQueryGroups(true, "HOH");
    // only use water oxygen
    filter.setQueryElements(true, "O");
    filter.setTargetElements(true, "O", "N", "S");
    // exclude "uninteresting" ligands
    Set<String> prohibitedGroups = new HashSet<>();
    prohibitedGroups.addAll(ExcludedLigandSets.ALL_GROUPS);
    if (!includeWaters) {
        prohibitedGroups.add("HOH");
    }
    filter.setProhibitedTargetGroups(prohibitedGroups);
    // calculate interactions
    Dataset<Row> data = GroupInteractionExtractor.getInteractions(pdb, filter);
    // keep only bridging water interactions that involve at least one organic ligand and at least one protein group
    data = filterBridgingWaterInteractions(data, maxInteractions).cache();
    // show some results
    data.show(50);
    System.out.println("Hits(all): " + data.count());
    // save interactions to a .parquet file
    String waterTag = includeWaters ? "_w" : "";
    String filename = outputPath + "/water_pl" + "_r" + resolution + "_d" + distanceCutoff + "_b" + bFactorCutoff + "_i" + minInteractions + maxInteractions + waterTag + "_" + timeStamp + ".parquet";
    System.out.println("Saving results to: " + filename);
    data.coalesce(1).write().mode("overwrite").format("parquet").save(filename);
    // exit Spark
    sc.close();
    long end = System.nanoTime();
    System.out.println("Time: " + TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}
Also used : StructureDataInterface (org.rcsb.mmtf.api.StructureDataInterface), InteractionFilter (edu.sdsc.mmtf.spark.interactions.InteractionFilter), CommandLine (org.apache.commons.cli.CommandLine), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), Row (org.apache.spark.sql.Row), SimpleDateFormat (java.text.SimpleDateFormat), SparkConf (org.apache.spark.SparkConf), ContainsLProteinChain (edu.sdsc.mmtf.spark.filters.ContainsLProteinChain), Resolution (edu.sdsc.mmtf.spark.filters.Resolution), HashSet (java.util.HashSet)
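
To sanity-check the saved Parquet file, it can be read back in a separate session with a SparkSession (org.apache.spark.sql.SparkSession). This is a minimal verification sketch, not part of the original example; the application name "VerifyWaterInteractions" is arbitrary, and filename refers to the variable built above.

// hypothetical verification step in a new session
SparkSession spark = SparkSession.builder().master("local[*]").appName("VerifyWaterInteractions").getOrCreate();
Dataset<Row> saved = spark.read().parquet(filename);
saved.printSchema();
System.out.println("Rows: " + saved.count());
spark.stop();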

Example 79 with StructureDataInterface

Use of org.rcsb.mmtf.api.StructureDataInterface in project mmtf-spark by sbl-sdsc.

From class SecondaryStructureElementDemo, method main:

public static void main(String[] args) throws IOException {
    long start = System.nanoTime();
    SparkConf conf = new SparkConf().setMaster("local[1]").setAppName(SecondaryStructureElementDemo.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    // single protein chain
    List<String> pdbIds = Arrays.asList("1STP");
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadReducedMmtfFiles(pdbIds, sc).cache();
    // split into polymer chains and keep only L-protein chains
    pdb = pdb.flatMapToPair(new StructureToPolymerChains()).filter(new ContainsLProteinChain());
    // extract beta-strand ("E") elements with a minimum length of 6 residues
    Dataset<Row> ds = SecondaryStructureElementExtractor.getDataset(pdb, "E", 6);
    // show the top 50 rows of this dataset
    ds.show(50, false);
    long end = System.nanoTime();
    System.out.println("Time: " + TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
    sc.close();
}
Also used : StructureToPolymerChains (edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), StructureDataInterface (org.rcsb.mmtf.api.StructureDataInterface), Row (org.apache.spark.sql.Row), SparkConf (org.apache.spark.SparkConf), ContainsLProteinChain (edu.sdsc.mmtf.spark.filters.ContainsLProteinChain)
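
Since the extractor call above takes a DSSP Q3 label and a minimum element length, a minimal variation sketch (same signature as the call in the example) extracts alpha-helical elements instead of strands:

// hypothetical variation: helical ("H") elements of at least 6 residues
Dataset<Row> helices = SecondaryStructureElementExtractor.getDataset(pdb, "H", 6);
helices.show(10, false);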

Example 80 with StructureDataInterface

Use of org.rcsb.mmtf.api.StructureDataInterface in project mmtf-spark by sbl-sdsc.

From class SecondaryStructureSegmentDemo, method main:

public static void main(String[] args) throws IOException {
    long start = System.nanoTime();
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(SecondaryStructureSegmentDemo.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    // single protein chain
    List<String> pdbIds = Arrays.asList("1STP");
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadReducedMmtfFiles(pdbIds, sc).cache();
    // split into polymer chains and keep only L-protein chains
    pdb = pdb.flatMapToPair(new StructureToPolymerChains()).filter(new ContainsLProteinChain());
    int segmentLength = 25;
    Dataset<Row> ds = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength);
    // show the top 50 rows of this dataset
    ds.show(50, false);
    long end = System.nanoTime();
    System.out.println("Time: " + TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
    sc.close();
}
Also used : StructureDataInterface (org.rcsb.mmtf.api.StructureDataInterface), StructureToPolymerChains (edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), Row (org.apache.spark.sql.Row), SparkConf (org.apache.spark.SparkConf), ContainsLProteinChain (edu.sdsc.mmtf.spark.filters.ContainsLProteinChain)
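
A hedged follow-up sketch: assuming the segment dataset exposes a "labelQ8" column for the center residue's 8-state secondary structure (an assumption about the extractor's schema, not confirmed by the example), segments can be tallied per label with a standard Spark aggregation.

// hypothetical follow-up: count segments per secondary-structure label
// (the column name "labelQ8" is an assumption about the schema)
ds.groupBy("labelQ8").count().show();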

Aggregations

StructureDataInterface (org.rcsb.mmtf.api.StructureDataInterface): 102 uses
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 60 uses
SparkConf (org.apache.spark.SparkConf): 58 uses
Row (org.apache.spark.sql.Row): 27 uses
StructureToPolymerChains (edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains): 22 uses
Test (org.junit.Test): 20 uses
Pisces (edu.sdsc.mmtf.spark.webfilters.Pisces): 19 uses
ArrayList (java.util.ArrayList): 12 uses
ProteinSequenceEncoder (edu.sdsc.mmtf.spark.ml.ProteinSequenceEncoder): 10 uses
ColumnarStructure (edu.sdsc.mmtf.spark.utils.ColumnarStructure): 10 uses
Tuple2 (scala.Tuple2): 9 uses
Path (java.nio.file.Path): 7 uses
HashSet (java.util.HashSet): 7 uses
AdapterToStructureData (org.rcsb.mmtf.encoder.AdapterToStructureData): 7 uses
JavaPairRDD (org.apache.spark.api.java.JavaPairRDD): 6 uses
ContainsLProteinChain (edu.sdsc.mmtf.spark.filters.ContainsLProteinChain): 5 uses
List (java.util.List): 5 uses
Resolution (edu.sdsc.mmtf.spark.filters.Resolution): 4 uses
MmtfReader (edu.sdsc.mmtf.spark.io.MmtfReader): 4 uses
File (java.io.File): 4 uses