Search in sources:

Example 1 with Resolution

Use of edu.sdsc.mmtf.spark.filters.Resolution in project mmtf-spark by sbl-sdsc.

The class FilterByRFree, method main.

public static void main(String[] args) throws FileNotFoundException {
    String path = MmtfReader.getMmtfReducedPath();
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(FilterByRFree.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    // the read, filter, and count operations are chained together
    long count = MmtfReader.readSequenceFile(path, sc).filter(new Resolution(0.0, 2.0)).count();
    System.out.println("# structures: " + count);
    sc.close();
}
Also used: JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) SparkConf(org.apache.spark.SparkConf) Resolution(edu.sdsc.mmtf.spark.filters.Resolution)
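
For orientation, a Resolution-style filter is just a Spark Function over the (PDB ID, structure) pairs read from the sequence file. The sketch below illustrates the general shape of such a filter; it is a hypothetical re-implementation, not the library's source: the class name is made up, and the inclusive-bounds and NaN behavior are assumptions. Only Function, Tuple2, and StructureDataInterface.getResolution() are standard APIs here.

import org.apache.spark.api.java.function.Function;
import org.rcsb.mmtf.api.StructureDataInterface;
import scala.Tuple2;

// Hypothetical sketch of a resolution filter; edu.sdsc.mmtf.spark.filters.Resolution
// may differ in detail.
public class ResolutionSketch implements Function<Tuple2<String, StructureDataInterface>, Boolean> {
    private static final long serialVersionUID = 1L;
    private final double minResolution;
    private final double maxResolution;

    public ResolutionSketch(double minResolution, double maxResolution) {
        this.minResolution = minResolution;
        this.maxResolution = maxResolution;
    }

    @Override
    public Boolean call(Tuple2<String, StructureDataInterface> t) {
        float resolution = t._2().getResolution();
        // a NaN resolution (e.g., an NMR entry without one) fails both
        // comparisons, so such entries are dropped, as noted in Example 3
        return resolution >= minResolution && resolution <= maxResolution;
    }
}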

Example 2 with Resolution

Use of edu.sdsc.mmtf.spark.filters.Resolution in project mmtf-spark by sbl-sdsc.

The class WriteMmtfCustom, method main.

/**
 * @param args command-line arguments (not used)
 * @throws FileNotFoundException if the MMTF Hadoop Sequence file cannot be found
 */
public static void main(String[] args) throws FileNotFoundException {
    String path = MmtfReader.getMmtfFullPath();
    long start = System.nanoTime();
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(WriteMmtfCustom.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    // read a 20% random sample of the PDB
    double fraction = 0.2;
    long seed = 123;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, fraction, seed, sc);
    // retain high resolution X-ray structures
    pdb = pdb.filter(new ExperimentalMethods(ExperimentalMethods.X_RAY_DIFFRACTION)).filter(new Resolution(0, 2.0)).filter(new Rfree(0, 0.2));
    // coalesce this into 8 partitions to avoid creating many small files
    pdb = pdb.coalesce(8);
    // save this subset in a Hadoop Sequence file
    MmtfWriter.writeSequenceFile(path + "_xray", sc, pdb);
    System.out.println("# structures in custom set: " + pdb.count());
    long end = System.nanoTime();
    System.out.println("Time: " + (end - start) / 1E9 + "sec.");
    sc.close();
}
Also used: ExperimentalMethods(edu.sdsc.mmtf.spark.filters.ExperimentalMethods) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) SparkConf(org.apache.spark.SparkConf) Rfree(edu.sdsc.mmtf.spark.filters.Rfree) Resolution(edu.sdsc.mmtf.spark.filters.Resolution)
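
The subset written above can be read back with the same reader used in the other examples. A short usage sketch, continuing inside the main method (using the path and sc variables, before sc.close()); no API beyond what the examples already use is assumed:

    // read the custom subset back in; the "_xray" suffix matches the write call above
    JavaPairRDD<String, StructureDataInterface> xraySubset =
            MmtfReader.readSequenceFile(path + "_xray", sc);
    System.out.println("# structures read back: " + xraySubset.count());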

Example 3 with Resolution

Use of edu.sdsc.mmtf.spark.filters.Resolution in project mmtf-spark by sbl-sdsc.

The class FilterByResolution, method main.

public static void main(String[] args) throws FileNotFoundException {
    String path = MmtfReader.getMmtfReducedPath();
    long start = System.nanoTime();
    // instantiate Spark. Each Spark application needs these two lines of code.
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(FilterByResolution.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    // read the entire PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);
    // filter PDB entries by resolution. Entries without resolution values,
    // e.g., NMR structures, are filtered out as well.
    pdb = pdb.filter(new Resolution(0.0, 2.0));
    System.out.println("# structures: " + pdb.count());
    // close Spark
    sc.close();
    long end = System.nanoTime();
    System.out.println((end - start) / 1E9 + " sec.");
}
Also used: JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) SparkConf(org.apache.spark.SparkConf) Resolution(edu.sdsc.mmtf.spark.filters.Resolution)
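
Since the resolution criterion only implicitly drops entries without a resolution value, the restriction to X-ray structures can be made explicit by combining filters, as Example 2 does. A sketch, written as a drop-in replacement for the filter line above and reusing the ExperimentalMethods filter from Example 2:

    // explicit variant: keep only X-ray structures, then apply the resolution cutoff
    pdb = pdb.filter(new ExperimentalMethods(ExperimentalMethods.X_RAY_DIFFRACTION))
             .filter(new Resolution(0.0, 2.0));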

Example 4 with Resolution

Use of edu.sdsc.mmtf.spark.filters.Resolution in project mmtf-spark by sbl-sdsc.

The class WaterInteractions, method main.

public static void main(String[] args) throws IOException, ParseException {
    String timeStamp = new SimpleDateFormat("yyyyMMdd_HHmm").format(Calendar.getInstance().getTime());
    long start = System.nanoTime();
    // process command line options (defaults are provided)
    CommandLine cmd = getCommandLine(args);
    String outputPath = cmd.getOptionValue("output-path");
    System.out.println(outputPath);
    String resolution = cmd.getOptionValue("resolution", "2");
    String minInteractions = cmd.getOptionValue("min-interactions", "2");
    String maxInteractions = cmd.getOptionValue("max-interactions", "4");
    String distanceCutoff = cmd.getOptionValue("distance-cutoff", "3");
    String bFactorCutoff = cmd.getOptionValue("b-factor-cutoff", "1.645");
    boolean includeWaters = cmd.hasOption("include-waters");
    // get path to MMTF Hadoop Sequence file
    String path = MmtfReader.getMmtfFullPath();
    // initialize Spark
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(WaterInteractions.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    // read PDB structures, filter by resolution, and keep only protein (L-chain) entries
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc).filter(new Resolution(0.0, Float.parseFloat(resolution))).filter(new ContainsLProteinChain(true));
    // set up interaction criteria
    InteractionFilter filter = new InteractionFilter();
    filter.setDistanceCutoff(Float.parseFloat(distanceCutoff));
    filter.setNormalizedbFactorCutoff(Float.parseFloat(bFactorCutoff));
    filter.setMinInteractions(Integer.parseInt(minInteractions));
    filter.setMaxInteractions(Integer.parseInt(maxInteractions));
    filter.setQueryGroups(true, "HOH");
    // only use water oxygen
    filter.setQueryElements(true, "O");
    filter.setTargetElements(true, "O", "N", "S");
    // exclude "uninteresting" ligands
    Set<String> prohibitedGroups = new HashSet<>();
    prohibitedGroups.addAll(ExcludedLigandSets.ALL_GROUPS);
    if (!includeWaters) {
        prohibitedGroups.add("HOH");
    }
    filter.setProhibitedTargetGroups(prohibitedGroups);
    // calculate interactions
    Dataset<Row> data = GroupInteractionExtractor.getInteractions(pdb, filter);
    // keep only interactions that involve at least one organic ligand and at least one protein partner
    data = filterBridgingWaterInteractions(data, maxInteractions).cache();
    // show some results
    data.show(50);
    System.out.println("Hits(all): " + data.count());
    // save interactions to a .parquet file
    String waterTag = includeWaters ? "_w" : "";
    String filename = outputPath + "/water_pl" + "_r" + resolution + "_d" + distanceCutoff + "_b" + bFactorCutoff + "_i" + minInteractions + maxInteractions + waterTag + "_" + timeStamp + ".parquet";
    System.out.println("Saving results to: " + filename);
    data.coalesce(1).write().mode("overwrite").format("parquet").save(filename);
    // exit Spark
    sc.close();
    long end = System.nanoTime();
    System.out.println("Time: " + TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}
Also used: StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) InteractionFilter(edu.sdsc.mmtf.spark.interactions.InteractionFilter) CommandLine(org.apache.commons.cli.CommandLine) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Row(org.apache.spark.sql.Row) SimpleDateFormat(java.text.SimpleDateFormat) SparkConf(org.apache.spark.SparkConf) ContainsLProteinChain(edu.sdsc.mmtf.spark.filters.ContainsLProteinChain) Resolution(edu.sdsc.mmtf.spark.filters.Resolution) HashSet(java.util.HashSet)
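
The getCommandLine(args) helper is not shown in this snippet. Below is a plausible reconstruction with Apache Commons CLI; it is a sketch, and the real helper in the project may declare the options differently. Only the option names and arity are inferred from the call sites above (which also supply the defaults).

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;

// Hypothetical reconstruction of the getCommandLine helper used above.
private static CommandLine getCommandLine(String[] args) throws ParseException {
    Options options = new Options();
    // options that take a value (defaults are supplied at the call sites)
    for (String name : new String[] { "output-path", "resolution", "min-interactions",
            "max-interactions", "distance-cutoff", "b-factor-cutoff" }) {
        options.addOption(Option.builder().longOpt(name).hasArg().build());
    }
    // boolean flag: presence is tested with cmd.hasOption("include-waters")
    options.addOption(Option.builder().longOpt("include-waters").build());
    return new DefaultParser().parse(options, args);
}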

Example 5 with Resolution

Use of edu.sdsc.mmtf.spark.filters.Resolution in project mm-dev by sbl-sdsc.

The class ArgLigandInteractions, method main.

public static void main(String[] args) throws IOException, ParseException {
    String timeStamp = new SimpleDateFormat("yyyyMMdd_HHmm").format(Calendar.getInstance().getTime());
    long start = System.nanoTime();
    // process command line options (defaults are provided)
    CommandLine cmd = getCommandLine(args);
    String outputPath = cmd.getOptionValue("output-path");
    System.out.println(outputPath);
    String resolution = cmd.getOptionValue("resolution", "2");
    String minInteractions = cmd.getOptionValue("min-interactions", "2");
    String maxInteractions = cmd.getOptionValue("max-interactions", "4");
    String distanceCutoff = cmd.getOptionValue("distance-cutoff", "3");
    String bFactorCutoff = cmd.getOptionValue("b-factor-cutoff", "1.645");
    boolean includeWaters = cmd.hasOption("include-waters");
    // get path to MMTF Hadoop Sequence file
    String path = MmtfReader.getMmtfFullPath();
    // initialize Spark
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(ArgLigandInteractions.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    // read PDB structures, filter by resolution, and keep only protein (L-chain) entries
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc).filter(new Resolution(0.0, Float.parseFloat(resolution))).filter(new ContainsLProteinChain(true));
    // set up interaction criteria
    InteractionFilter filter = new InteractionFilter();
    filter.setDistanceCutoff(Float.parseFloat(distanceCutoff));
    filter.setNormalizedbFactorCutoff(Float.parseFloat(bFactorCutoff));
    filter.setMinInteractions(Integer.parseInt(minInteractions));
    filter.setMaxInteractions(Integer.parseInt(maxInteractions));
    filter.setQueryGroups(true, "ARG");
    // only use nitrogen atoms of the ARG query groups
    filter.setQueryElements(true, "N");
    filter.setTargetElements(true, "O", "N", "S");
    // exclude the 20 standard amino acids as interaction targets
    filter.setTargetGroups(false, new HashSet<>(PolymerComposition.AMINO_ACIDS_20));
    // exclude "uninteresting" ligands
    Set<String> prohibitedGroups = new HashSet<>();
    prohibitedGroups.addAll(ExcludedLigandSets.ALL_GROUPS);
    if (!includeWaters) {
        prohibitedGroups.add("HOH");
    }
    filter.setProhibitedTargetGroups(prohibitedGroups);
    // calculate interactions
    Dataset<Row> data = GroupInteractionExtractor.getInteractions(pdb, filter);
    // only consider interactions with ARG sidechain nitrogens
    data = data.filter("atom0 = 'NE' OR atom0 = 'NH1' OR atom0 = 'NH2'");
    // the interacting group should be an organic ligand (LGO)
    data = data.filter("type1 = 'LGO'");
    data = data.select("pdbId", "atom0", "groupNum0", "chain0", "atom1", "group1", "groupNum1", "chain1", "distance1");
    // data.show(50);
    // self-join: pair interactions where two different ARG nitrogen atoms
    // contact two different atoms of the same ligand group
    Dataset<Row> data2 = data;
    Dataset<Row> joint = data.join(data2,
        data.col("pdbId").equalTo(data2.col("pdbId"))
            .and(data.col("atom0").notEqual(data2.col("atom0")))
            .and(data.col("groupNum1").equalTo(data2.col("groupNum1"))
                .and(data.col("chain1").equalTo(data2.col("chain1"))
                    .and(data.col("atom1").notEqual(data2.col("atom1"))))));
    joint.show(100);
    // The commented-out block below is an earlier, alternative approach that
    // required two ligand interactions per ARG; it is kept here for reference.
    // data = data.select("pdbId",
    // "atom0", "groupNum0", "chain0",
    // "atom1", "groupNum1", "chain1", "distance1",
    // "atom2", "groupNum2", "chain2", "distance2");
    // 
    // // only consider interactions with ARG sidechain nitrogens
    // data = data.filter("atom0 = 'NE' OR atom0 = 'NH1' OR atom0 = 'NH2'");
    // 
    // // the interacting group should be an organic ligand (LGO)
    // data = data.filter("type1 = 'LGO' AND type2 = 'LGO'").cache();
    // 
    // // the two interacting atoms must come from the same group and chain
    // data = data.filter("group1 = group2 AND groupNum1 = groupNum2 AND chain1 = chain2");
    // Dataset<Row> data2 = data;
    // Dataset<Row> joint = data.join(data2,
    // data.col("pdbId").equalTo(data2.col("pdbId")).and
    // (data.col("groupNum1").equalTo(data2.col("groupNum1")).and
    // (data.col("chain1").equalTo(data2.col("chain1")))
    // ));
    // joint.show(100);
    // RelationalGroupedDataset groupBy = data.groupBy("pdbId", "groupNum0", "chain0", "group1", "groupNum1");
    // groupBy.count().show(1000);
    // show some results
    // data.show(50);
    // System.out.println("Hits(all): " + data.count());
    // 
    // // save interactions to a .parquet file
    // String waterTag = includeWaters ? "_w" : "";
    // String filename = outputPath + "/arg_lig" + "_r" + resolution
    // + "_d" + distanceCutoff
    // + "_b" + bFactorCutoff + "_i" + minInteractions + maxInteractions + waterTag + "_" + timeStamp + ".parquet";
    // System.out.println("Saving results to: " + filename);
    // data.coalesce(1).write().mode("overwrite").format("parquet").save(filename);
    // exit Spark
    sc.close();
    long end = System.nanoTime();
    System.out.println("Time: " + TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}
Also used: StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) InteractionFilter(edu.sdsc.mmtf.spark.interactions.InteractionFilter) CommandLine(org.apache.commons.cli.CommandLine) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Row(org.apache.spark.sql.Row) SimpleDateFormat(java.text.SimpleDateFormat) SparkConf(org.apache.spark.SparkConf) ContainsLProteinChain(edu.sdsc.mmtf.spark.filters.ContainsLProteinChain) Resolution(edu.sdsc.mmtf.spark.filters.Resolution) HashSet(java.util.HashSet)
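
The inline self-join condition in this example is hard to read as a single chained Column expression. An equivalent alias-based formulation is sketched below, purely as a readability alternative using standard Spark SQL APIs (Dataset.as, functions.expr); it is not code from the project:

    import static org.apache.spark.sql.functions.expr;

    // aliased copies of the dataset make the join predicate readable as SQL
    Dataset<Row> a = data.as("a");
    Dataset<Row> b = data.as("b");
    Dataset<Row> pairs = a.join(b, expr(
            "a.pdbId = b.pdbId AND a.atom0 <> b.atom0 "
            + "AND a.groupNum1 = b.groupNum1 AND a.chain1 = b.chain1 "
            + "AND a.atom1 <> b.atom1"));
    pairs.show(100);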

Aggregations

Resolution (edu.sdsc.mmtf.spark.filters.Resolution): 5
SparkConf (org.apache.spark.SparkConf): 5
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 5
StructureDataInterface (org.rcsb.mmtf.api.StructureDataInterface): 4
ContainsLProteinChain (edu.sdsc.mmtf.spark.filters.ContainsLProteinChain): 2
InteractionFilter (edu.sdsc.mmtf.spark.interactions.InteractionFilter): 2
SimpleDateFormat (java.text.SimpleDateFormat): 2
HashSet (java.util.HashSet): 2
CommandLine (org.apache.commons.cli.CommandLine): 2
Row (org.apache.spark.sql.Row): 2
ExperimentalMethods (edu.sdsc.mmtf.spark.filters.ExperimentalMethods): 1
Rfree (edu.sdsc.mmtf.spark.filters.Rfree): 1