Use of edu.sdsc.mmtf.spark.filters.Resolution in project mmtf-spark by sbl-sdsc.
The class FilterByRFree, method main:
public static void main(String[] args) throws FileNotFoundException {
    String path = MmtfReader.getMmtfReducedPath();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(FilterByRFree.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // here the methods are chained together
    long count = MmtfReader.readSequenceFile(path, sc)
            .filter(new Resolution(0.0, 2.0))
            .count();

    System.out.println("# structures: " + count);

    sc.close();
}
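Despite the class name, this snippet filters by resolution. A minimal sketch of the R-free variant the name suggests, reusing the Rfree filter that appears in WriteMmtfCustom below (the 0.2 cutoff is illustrative, not from this snippet):

    // hypothetical variant: filter by R-free instead of resolution
    long countByRfree = MmtfReader.readSequenceFile(path, sc)
            .filter(new Rfree(0.0, 0.2)) // keep R-free in [0.0, 0.2]
            .count();
    System.out.println("# structures (R-free <= 0.2): " + countByRfree);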
Use of edu.sdsc.mmtf.spark.filters.Resolution in project mmtf-spark by sbl-sdsc.
The class WriteMmtfCustom, method main:
/**
 * @param args
 * @throws FileNotFoundException
 */
public static void main(String[] args) throws FileNotFoundException {
    String path = MmtfReader.getMmtfFullPath();

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(WriteMmtfCustom.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read a 20% random sample of the PDB
    double fraction = 0.2;
    long seed = 123;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, fraction, seed, sc);

    // retain high-resolution X-ray structures
    pdb = pdb.filter(new ExperimentalMethods(ExperimentalMethods.X_RAY_DIFFRACTION))
             .filter(new Resolution(0, 2.0))
             .filter(new Rfree(0, 0.2));

    // coalesce into 8 partitions to avoid creating many small files
    pdb = pdb.coalesce(8);

    // save this subset in a Hadoop Sequence file
    MmtfWriter.writeSequenceFile(path + "_xray", sc, pdb);

    System.out.println("# structures in custom set: " + pdb.count());

    long end = System.nanoTime();
    System.out.println("Time: " + (end - start) / 1E9 + " sec.");

    sc.close();
}
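A short follow-up sketch (assuming the same path variable and Spark context as above) that reads the saved custom set back with the same reader used throughout these examples:

    // read the custom subset written above back into a pair RDD
    JavaPairRDD<String, StructureDataInterface> customSet = MmtfReader.readSequenceFile(path + "_xray", sc);
    System.out.println("# structures read back: " + customSet.count());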
Use of edu.sdsc.mmtf.spark.filters.Resolution in project mmtf-spark by sbl-sdsc.
The class FilterByResolution, method main:
public static void main(String[] args) throws FileNotFoundException {
    String path = MmtfReader.getMmtfReducedPath();

    long start = System.nanoTime();

    // instantiate Spark. Each Spark application needs these two lines of code.
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(FilterByResolution.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read the entire PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);

    // filter PDB entries by resolution. Entries without resolution values,
    // e.g., NMR structures, will be filtered out as well.
    pdb = pdb.filter(new Resolution(0.0, 2.0));

    System.out.println("# structures: " + pdb.count());

    // close Spark
    sc.close();

    long end = System.nanoTime();
    System.out.println((end - start) / 1E9 + " sec.");
}
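Because entries without a resolution value are dropped implicitly, the intent can be made explicit by first restricting to X-ray structures; a minimal sketch reusing the ExperimentalMethods filter shown in WriteMmtfCustom above:

    // hypothetical variant: keep only X-ray structures, then apply the resolution cutoff
    pdb = pdb.filter(new ExperimentalMethods(ExperimentalMethods.X_RAY_DIFFRACTION))
             .filter(new Resolution(0.0, 2.0));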
Use of edu.sdsc.mmtf.spark.filters.Resolution in project mmtf-spark by sbl-sdsc.
The class WaterInteractions, method main:
public static void main(String[] args) throws IOException, ParseException {
    String timeStamp = new SimpleDateFormat("yyyyMMdd_HHmm").format(Calendar.getInstance().getTime());
    long start = System.nanoTime();

    // process command line options (defaults are provided)
    CommandLine cmd = getCommandLine(args);
    String outputPath = cmd.getOptionValue("output-path");
    System.out.println(outputPath);
    String resolution = cmd.getOptionValue("resolution", "2");
    String minInteractions = cmd.getOptionValue("min-interactions", "2");
    String maxInteractions = cmd.getOptionValue("max-interactions", "4");
    String distanceCutoff = cmd.getOptionValue("distance-cutoff", "3");
    String bFactorCutoff = cmd.getOptionValue("b-factor-cutoff", "1.645");
    boolean includeWaters = cmd.hasOption("include-waters");

    // get path to MMTF Hadoop Sequence file
    String path = MmtfReader.getMmtfFullPath();

    // initialize Spark
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(WaterInteractions.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read PDB structures, filter by resolution, and keep only entries with L-protein chains
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc)
            .filter(new Resolution(0.0, Float.parseFloat(resolution)))
            .filter(new ContainsLProteinChain(true));

    // set up interaction criteria
    InteractionFilter filter = new InteractionFilter();
    filter.setDistanceCutoff(Float.parseFloat(distanceCutoff));
    filter.setNormalizedbFactorCutoff(Float.parseFloat(bFactorCutoff));
    filter.setMinInteractions(Integer.parseInt(minInteractions));
    filter.setMaxInteractions(Integer.parseInt(maxInteractions));
    filter.setQueryGroups(true, "HOH");

    // only use water oxygen as the query atom
    filter.setQueryElements(true, "O");
    filter.setTargetElements(true, "O", "N", "S");

    // exclude "uninteresting" ligands
    Set<String> prohibitedGroups = new HashSet<>();
    prohibitedGroups.addAll(ExcludedLigandSets.ALL_GROUPS);
    if (!includeWaters) {
        prohibitedGroups.add("HOH");
    }
    filter.setProhibitedTargetGroups(prohibitedGroups);

    // calculate interactions
    Dataset<Row> data = GroupInteractionExtractor.getInteractions(pdb, filter);

    // keep only bridging water interactions with at least one organic ligand and one protein partner
    data = filterBridgingWaterInteractions(data, maxInteractions).cache();

    // show some results
    data.show(50);
    System.out.println("Hits(all): " + data.count());

    // save interactions to a .parquet file
    String waterTag = includeWaters ? "_w" : "";
    String filename = outputPath + "/water_pl" + "_r" + resolution + "_d" + distanceCutoff + "_b" + bFactorCutoff
            + "_i" + minInteractions + maxInteractions + waterTag + "_" + timeStamp + ".parquet";
    System.out.println("Saving results to: " + filename);
    data.coalesce(1).write().mode("overwrite").format("parquet").save(filename);

    // exit Spark
    sc.close();

    long end = System.nanoTime();
    System.out.println("Time: " + TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}
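For completeness, a short usage sketch (assuming the filename variable from above and the standard org.apache.spark.sql.SparkSession API) for reading the saved Parquet file back into a Dataset:

    // read the saved interactions back; SparkSession is the standard Spark SQL entry point
    SparkSession spark = SparkSession.builder().master("local[*]").appName("ReadWaterInteractions").getOrCreate();
    Dataset<Row> results = spark.read().parquet(filename);
    results.printSchema();
    results.show(10);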
Use of edu.sdsc.mmtf.spark.filters.Resolution in project mm-dev by sbl-sdsc.
The class ArgLigandInteractions, method main:
public static void main(String[] args) throws IOException, ParseException {
    String timeStamp = new SimpleDateFormat("yyyyMMdd_HHmm").format(Calendar.getInstance().getTime());
    long start = System.nanoTime();

    // process command line options (defaults are provided)
    CommandLine cmd = getCommandLine(args);
    String outputPath = cmd.getOptionValue("output-path");
    System.out.println(outputPath);
    String resolution = cmd.getOptionValue("resolution", "2");
    String minInteractions = cmd.getOptionValue("min-interactions", "2");
    String maxInteractions = cmd.getOptionValue("max-interactions", "4");
    String distanceCutoff = cmd.getOptionValue("distance-cutoff", "3");
    String bFactorCutoff = cmd.getOptionValue("b-factor-cutoff", "1.645");
    boolean includeWaters = cmd.hasOption("include-waters");

    // get path to MMTF Hadoop Sequence file
    String path = MmtfReader.getMmtfFullPath();

    // initialize Spark
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(ArgLigandInteractions.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read PDB structures, filter by resolution, and keep only entries with L-protein chains
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc)
            .filter(new Resolution(0.0, Float.parseFloat(resolution)))
            .filter(new ContainsLProteinChain(true));

    // set up interaction criteria
    InteractionFilter filter = new InteractionFilter();
    filter.setDistanceCutoff(Float.parseFloat(distanceCutoff));
    filter.setNormalizedbFactorCutoff(Float.parseFloat(bFactorCutoff));
    filter.setMinInteractions(Integer.parseInt(minInteractions));
    filter.setMaxInteractions(Integer.parseInt(maxInteractions));
    filter.setQueryGroups(true, "ARG");

    // only use arginine nitrogen atoms as query atoms
    filter.setQueryElements(true, "N");
    filter.setTargetElements(true, "O", "N", "S");
    filter.setTargetGroups(false, new HashSet<>(PolymerComposition.AMINO_ACIDS_20));

    // exclude "uninteresting" ligands
    Set<String> prohibitedGroups = new HashSet<>();
    prohibitedGroups.addAll(ExcludedLigandSets.ALL_GROUPS);
    if (!includeWaters) {
        prohibitedGroups.add("HOH");
    }
    filter.setProhibitedTargetGroups(prohibitedGroups);

    // calculate interactions
    Dataset<Row> data = GroupInteractionExtractor.getInteractions(pdb, filter);

    // only consider interactions with ARG sidechain nitrogens
    data = data.filter("atom0 = 'NE' OR atom0 = 'NH1' OR atom0 = 'NH2'");

    // the interacting group should be an organic ligand (LGO)
    data = data.filter("type1 = 'LGO'");
    data = data.select("pdbId", "atom0", "groupNum0", "chain0", "atom1", "group1", "groupNum1", "chain1", "distance1");
    // data.show(50);

    // self-join: pair two distinct ARG nitrogen contacts to the same ligand group and chain
    Dataset<Row> data2 = data;
    Dataset<Row> joint = data.join(data2,
            data.col("pdbId").equalTo(data2.col("pdbId"))
                    .and(data.col("atom0").notEqual(data2.col("atom0")))
                    .and(data.col("groupNum1").equalTo(data2.col("groupNum1"))
                            .and(data.col("chain1").equalTo(data2.col("chain1"))
                                    .and(data.col("atom1").notEqual(data2.col("atom1"))))));
    joint.show(100);

    // earlier, commented-out variant of the analysis:
    // data = data.select("pdbId",
    //         "atom0", "groupNum0", "chain0",
    //         "atom1", "groupNum1", "chain1", "distance1",
    //         "atom2", "groupNum2", "chain2", "distance2");
    //
    // // only consider interactions with ARG sidechain nitrogens
    // data = data.filter("atom0 = 'NE' OR atom0 = 'NH1' OR atom0 = 'NH2'");
    //
    // // the interacting group should be an organic ligand (LGO)
    // data = data.filter("type1 = 'LGO' AND type2 = 'LGO'").cache();
    //
    // // the two interacting atoms must come from the same group and chain
    // data = data.filter("group1 = group2 AND groupNum1 = groupNum2 AND chain1 = chain2");
    // Dataset<Row> data2 = data;
    // Dataset<Row> joint = data.join(data2,
    //         data.col("pdbId").equalTo(data2.col("pdbId")).and
    //         (data.col("groupNum1").equalTo(data2.col("groupNum1")).and
    //         (data.col("chain1").equalTo(data2.col("chain1")))));
    // joint.show(100);
    // RelationalGroupedDataset groupBy = data.groupBy("pdbId", "groupNum0", "chain0", "group1", "groupNum1");
    // groupBy.count().show(1000);

    // show some results
    // data.show(50);
    // System.out.println("Hits(all): " + data.count());
    //
    // // save interactions to a .parquet file
    // String waterTag = includeWaters ? "_w" : "";
    // String filename = outputPath + "/arg_lig" + "_r" + resolution
    //         + "_d" + distanceCutoff
    //         + "_b" + bFactorCutoff + "_i" + minInteractions + maxInteractions + waterTag + "_" + timeStamp + ".parquet";
    // System.out.println("Saving results to: " + filename);
    // data.coalesce(1).write().mode("overwrite").format("parquet").save(filename);

    // exit Spark
    sc.close();

    long end = System.nanoTime();
    System.out.println("Time: " + TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}
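The self-join above uses the same Dataset reference on both sides of the join, which Spark can resolve ambiguously. A minimal sketch of an equivalent join with explicit aliases (assuming Spark's standard Dataset.alias and org.apache.spark.sql.functions.col; the column names are those selected above):

    // assumes: import static org.apache.spark.sql.functions.col;
    Dataset<Row> a = data.alias("a");
    Dataset<Row> b = data.alias("b");
    // pair two distinct ARG nitrogen contacts to the same ligand group and chain
    Dataset<Row> pairs = a.join(b,
            col("a.pdbId").equalTo(col("b.pdbId"))
                    .and(col("a.atom0").notEqual(col("b.atom0")))
                    .and(col("a.groupNum1").equalTo(col("b.groupNum1")))
                    .and(col("a.chain1").equalTo(col("b.chain1")))
                    .and(col("a.atom1").notEqual(col("b.atom1"))));
    pairs.show(100);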