use of edu.sdsc.mmtf.spark.filters.ContainsLProteinChain in project mmtf-spark by sbl-sdsc.
the class WaterInteractions method main.
/**
 * Command line entry point that extracts interactions around water molecules
 * (query group {@code HOH}, query element {@code O}) from PDB structures,
 * keeps the bridging-water interactions selected by
 * {@code filterBridgingWaterInteractions}, prints a sample and the total hit
 * count, and saves the result as a single parquet file below the given
 * output path.
 *
 * <p>Options read here (defaults in parentheses): {@code output-path},
 * {@code resolution} (2), {@code min-interactions} (2),
 * {@code max-interactions} (4), {@code distance-cutoff} (3),
 * {@code b-factor-cutoff} (1.645) and the flag {@code include-waters}.
 *
 * @param args command line arguments, handed to {@code getCommandLine}
 * @throws IOException propagated from the called reader/helper methods
 * @throws ParseException presumably raised by {@code getCommandLine} for an
 *         invalid command line - TODO confirm
 * @throws IllegalArgumentException if no output path is available
 * @throws NumberFormatException if a numeric option cannot be parsed
 */
public static void main(String[] args) throws IOException, ParseException {
    String timeStamp = new SimpleDateFormat("yyyyMMdd_HHmm").format(Calendar.getInstance().getTime());
    long start = System.nanoTime();

    // process command line options (defaults are provided)
    CommandLine cmd = getCommandLine(args);
    String outputPath = cmd.getOptionValue("output-path");
    System.out.println(outputPath);
    // Fail fast: without this check the run would finish the whole calculation
    // and then write to a path that literally starts with "null/".
    // NOTE(review): getCommandLine may already enforce this option - confirm.
    if (outputPath == null) {
        throw new IllegalArgumentException("Missing required option: output-path");
    }
    String resolution = cmd.getOptionValue("resolution", "2");
    String minInteractions = cmd.getOptionValue("min-interactions", "2");
    String maxInteractions = cmd.getOptionValue("max-interactions", "4");
    String distanceCutoff = cmd.getOptionValue("distance-cutoff", "3");
    String bFactorCutoff = cmd.getOptionValue("b-factor-cutoff", "1.645");
    boolean includeWaters = cmd.hasOption("include-waters");

    // Parse the numeric options before Spark is started so that a malformed
    // value is reported immediately. The original strings are kept because
    // they are reused verbatim in the output file name.
    float maxResolution = Float.parseFloat(resolution);
    float maxDistance = Float.parseFloat(distanceCutoff);
    float maxNormalizedBFactor = Float.parseFloat(bFactorCutoff);
    int minCount = Integer.parseInt(minInteractions);
    int maxCount = Integer.parseInt(maxInteractions);

    // get path to MMTF Hadoop Sequence file
    String path = MmtfReader.getMmtfFullPath();

    // initialize Spark
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(WaterInteractions.class.getSimpleName());

    // try-with-resources: the Spark context is closed even if a step below throws
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // read PDB structures and filter by resolution and only include proteins
        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc)
                .filter(new Resolution(0.0, maxResolution))
                .filter(new ContainsLProteinChain(true));

        // setup interaction criteria
        InteractionFilter filter = new InteractionFilter();
        filter.setDistanceCutoff(maxDistance);
        filter.setNormalizedbFactorCutoff(maxNormalizedBFactor);
        filter.setMinInteractions(minCount);
        filter.setMaxInteractions(maxCount);
        filter.setQueryGroups(true, "HOH");
        // only use water oxygen
        filter.setQueryElements(true, "O");
        filter.setTargetElements(true, "O", "N", "S");

        // exclude "uninteresting" ligands; water itself is excluded as a
        // target unless --include-waters was given
        Set<String> prohibitedGroups = new HashSet<>(ExcludedLigandSets.ALL_GROUPS);
        if (!includeWaters) {
            prohibitedGroups.add("HOH");
        }
        filter.setProhibitedTargetGroups(prohibitedGroups);

        // calculate interactions
        Dataset<Row> data = GroupInteractionExtractor.getInteractions(pdb, filter);

        // keep only interactions with at least one organic ligand and one protein interaction;
        // cached because the dataset is used by show, count and save below
        data = filterBridgingWaterInteractions(data, maxInteractions).cache();

        // show some results
        data.show(50);
        System.out.println("Hits(all): " + data.count());

        // save interactions to a .parquet file
        String waterTag = includeWaters ? "_w" : "";
        String filename = outputPath + "/water_pl" + "_r" + resolution + "_d" + distanceCutoff + "_b" + bFactorCutoff
                + "_i" + minInteractions + maxInteractions + waterTag + "_" + timeStamp + ".parquet";
        System.out.println("Saving results to: " + filename);
        data.coalesce(1).write().mode("overwrite").format("parquet").save(filename);
    }

    // the Spark context has been closed at this point
    long end = System.nanoTime();
    System.out.println("Time: " + TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}
use of edu.sdsc.mmtf.spark.filters.ContainsLProteinChain in project mmtf-spark by sbl-sdsc.
the class SecondaryStructureElementDemo method main.
/**
 * Demo entry point: downloads the reduced MMTF representation of PDB entry
 * 1STP, splits it into polymer chains, keeps the chains accepted by
 * {@code ContainsLProteinChain}, and prints the dataset returned by
 * {@code SecondaryStructureElementExtractor.getDataset(pdb, "E", 6)}.
 *
 * @param args not used
 * @throws IOException propagated from the called reader/helper methods
 */
public static void main(String[] args) throws IOException {
    long start = System.nanoTime();

    // Name the Spark application after this demo class (the previous code
    // used the name of an unrelated class, CustomReportDemo).
    SparkConf conf = new SparkConf().setMaster("local[1]")
            .setAppName(SecondaryStructureElementDemo.class.getSimpleName());

    // try-with-resources: the Spark context is closed even if a step below throws
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // single protein chain
        List<String> pdbIds = Arrays.asList("1STP");
        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadReducedMmtfFiles(pdbIds, sc).cache();
        pdb = pdb.flatMapToPair(new StructureToPolymerChains()).filter(new ContainsLProteinChain());

        // NOTE(review): "E" and 6 presumably select the secondary structure
        // type and a minimum element length - confirm against the extractor.
        Dataset<Row> ds = SecondaryStructureElementExtractor.getDataset(pdb, "E", 6);

        // show the top 50 rows of this dataset (false = do not truncate long values)
        ds.show(50, false);

        long end = System.nanoTime();
        System.out.println("Time: " + TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
    }
}
use of edu.sdsc.mmtf.spark.filters.ContainsLProteinChain in project mmtf-spark by sbl-sdsc.
the class SecondaryStructureSegmentDemo method main.
/**
 * Demo entry point: downloads the reduced MMTF representation of PDB entry
 * 1STP, splits it into polymer chains, keeps the chains accepted by
 * {@code ContainsLProteinChain}, and prints the dataset returned by
 * {@code SecondaryStructureSegmentExtractor.getDataset} for a segment
 * length of 25.
 *
 * @param args not used
 * @throws IOException propagated from the called reader/helper methods
 */
public static void main(String[] args) throws IOException {
    long start = System.nanoTime();

    // Name the Spark application after this demo class (the previous code
    // used the name of an unrelated class, CustomReportDemo).
    SparkConf conf = new SparkConf().setMaster("local[*]")
            .setAppName(SecondaryStructureSegmentDemo.class.getSimpleName());

    // try-with-resources: the Spark context is closed even if a step below throws
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // single protein chain
        List<String> pdbIds = Arrays.asList("1STP");
        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadReducedMmtfFiles(pdbIds, sc).cache();
        pdb = pdb.flatMapToPair(new StructureToPolymerChains()).filter(new ContainsLProteinChain());

        int segmentLength = 25;
        Dataset<Row> ds = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength);

        // show the top 50 rows of this dataset (false = do not truncate long values)
        ds.show(50, false);

        long end = System.nanoTime();
        System.out.println("Time: " + TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
    }
}
use of edu.sdsc.mmtf.spark.filters.ContainsLProteinChain in project mm-dev by sbl-sdsc.
the class ArgLigandInteractions method main.
/**
 * Command line entry point that extracts interactions of arginine
 * (query group {@code ARG}, query element {@code N}) with organic ligands
 * from PDB structures and prints pairs of such interactions in which two
 * different arginine side-chain nitrogens contact two different atoms of
 * the same ligand group.
 *
 * <p>Options read here (defaults in parentheses): {@code output-path},
 * {@code resolution} (2), {@code min-interactions} (2),
 * {@code max-interactions} (4), {@code distance-cutoff} (3),
 * {@code b-factor-cutoff} (1.645) and the flag {@code include-waters}.
 *
 * <p>This is an exploratory run: results are only printed. An earlier
 * parquet export step (file name prefix {@code arg_lig}) is disabled and
 * has been dropped from this method; restore it from version control if
 * persisted output is needed again.
 *
 * @param args command line arguments, handed to {@code getCommandLine}
 * @throws IOException propagated from the called reader/helper methods
 * @throws ParseException presumably raised by {@code getCommandLine} for an
 *         invalid command line - TODO confirm
 * @throws NumberFormatException if a numeric option cannot be parsed
 */
public static void main(String[] args) throws IOException, ParseException {
    long start = System.nanoTime();

    // process command line options (defaults are provided)
    CommandLine cmd = getCommandLine(args);
    // the output path is only echoed; nothing is written by this method
    String outputPath = cmd.getOptionValue("output-path");
    System.out.println(outputPath);
    String resolution = cmd.getOptionValue("resolution", "2");
    String minInteractions = cmd.getOptionValue("min-interactions", "2");
    String maxInteractions = cmd.getOptionValue("max-interactions", "4");
    String distanceCutoff = cmd.getOptionValue("distance-cutoff", "3");
    String bFactorCutoff = cmd.getOptionValue("b-factor-cutoff", "1.645");
    boolean includeWaters = cmd.hasOption("include-waters");

    // Parse the numeric options before Spark is started so that a malformed
    // value is reported immediately.
    float maxResolution = Float.parseFloat(resolution);
    float maxDistance = Float.parseFloat(distanceCutoff);
    float maxNormalizedBFactor = Float.parseFloat(bFactorCutoff);
    int minCount = Integer.parseInt(minInteractions);
    int maxCount = Integer.parseInt(maxInteractions);

    // get path to MMTF Hadoop Sequence file
    String path = MmtfReader.getMmtfFullPath();

    // initialize Spark
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(ArgLigandInteractions.class.getSimpleName());

    // try-with-resources: the Spark context is closed even if a step below throws
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // read PDB structures and filter by resolution and only include proteins
        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc)
                .filter(new Resolution(0.0, maxResolution))
                .filter(new ContainsLProteinChain(true));

        // setup interaction criteria
        InteractionFilter filter = new InteractionFilter();
        filter.setDistanceCutoff(maxDistance);
        filter.setNormalizedbFactorCutoff(maxNormalizedBFactor);
        filter.setMinInteractions(minCount);
        filter.setMaxInteractions(maxCount);
        filter.setQueryGroups(true, "ARG");
        // only use arginine nitrogen atoms as query atoms
        filter.setQueryElements(true, "N");
        filter.setTargetElements(true, "O", "N", "S");
        // NOTE(review): the 'false' flag presumably excludes the 20 standard
        // amino acids as interaction targets - confirm against InteractionFilter.
        filter.setTargetGroups(false, new HashSet<>(PolymerComposition.AMINO_ACIDS_20));

        // exclude "uninteresting" ligands; water is excluded as a target
        // unless --include-waters was given
        Set<String> prohibitedGroups = new HashSet<>(ExcludedLigandSets.ALL_GROUPS);
        if (!includeWaters) {
            prohibitedGroups.add("HOH");
        }
        filter.setProhibitedTargetGroups(prohibitedGroups);

        // calculate interactions
        Dataset<Row> data = GroupInteractionExtractor.getInteractions(pdb, filter);

        // only consider interactions with ARG sidechain nitrogens
        data = data.filter("atom0 = 'NE' OR atom0 = 'NH1' OR atom0 = 'NH2'");

        // the interacting group should be an organic ligand (LGO)
        data = data.filter("type1 = 'LGO'");

        data = data.select("pdbId", "atom0", "groupNum0", "chain0", "atom1", "group1", "groupNum1", "chain1",
                "distance1");

        // Self-join: pair rows of the same structure that hit the same ligand
        // group (groupNum1, chain1) but differ in both the arginine atom
        // (atom0) and the ligand atom (atom1).
        // NOTE(review): both sides are the same Dataset instance, so the
        // column references rely on Spark's handling of self-join
        // conditions - confirm for the Spark version in use.
        Dataset<Row> data2 = data;
        Dataset<Row> joint = data.join(data2,
                (data.col("pdbId").equalTo(data2.col("pdbId")))
                        .and(data.col("atom0").notEqual(data2.col("atom0")))
                        .and(data.col("groupNum1").equalTo(data2.col("groupNum1"))
                                .and(data.col("chain1").equalTo(data2.col("chain1"))
                                        .and(data.col("atom1").notEqual(data2.col("atom1"))))));
        joint.show(100);
    }

    // the Spark context has been closed at this point
    long end = System.nanoTime();
    System.out.println("Time: " + TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}
Aggregations