Use of org.rcsb.mmtf.api.StructureDataInterface in the project mmtf-spark by sbl-sdsc.
Class ReadMmtfReduced, method main.
/**
 * Reads a fixed list of PDB entries from the reduced MMTF Hadoop sequence file
 * and prints the number of structures that were read.
 *
 * @param args command line arguments (not used)
 * @throws FileNotFoundException declared by the signature; presumably raised by
 *         the MmtfReader path lookup or read (not visible here)
 */
public static void main(String[] args) throws FileNotFoundException {
    String path = MmtfReader.getMmtfReducedPath();

    // Every Spark application needs a configuration and a context.
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(ReadMmtfReduced.class.getSimpleName());

    // try-with-resources closes the Spark context even when reading or counting fails
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // read the listed PDB entries from a local Hadoop sequence file
        List<String> pdbIds = Arrays.asList("1AQ1", "1B38", "1B39", "1BUH");
        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, pdbIds, sc);

        System.out.println("# structures: " + pdb.count());
    }
}
Use of org.rcsb.mmtf.api.StructureDataInterface in the project mmtf-spark by sbl-sdsc.
Class DSSPDemo, method main.
/**
 * Downloads the full MMTF file for a single PDB entry, splits it into polymer
 * chains, extracts a secondary structure dataset and prints its schema and
 * first two rows together with the elapsed time.
 *
 * @param args command line arguments (not used)
 * @throws IOException declared by the signature; presumably raised while
 *         downloading or reading the MMTF data (not visible here)
 */
public static void main(String[] args) throws IOException {
    long start = System.nanoTime();

    // Name the Spark application after this demo class (was mislabelled as CustomReportDemo).
    SparkConf conf = new SparkConf().setMaster("local[1]").setAppName(DSSPDemo.class.getSimpleName());

    // try-with-resources closes the Spark context even when a step below fails
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // single protein chain
        List<String> pdbIds = Arrays.asList("1STP");
        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc).cache();
        pdb = pdb.flatMapToPair(new StructureToPolymerChains());

        Dataset<Row> ds = SecondaryStructureExtractor.getDataset(pdb);

        // show the schema of this dataset
        ds.printSchema();
        ds.show(2, false);

        long end = System.nanoTime();
        System.out.println("Time: " + (end - start) / 1E9 + "sec.");
    }
}
Use of org.rcsb.mmtf.api.StructureDataInterface in the project mmtf-spark by sbl-sdsc.
Class WaterInteractions, method main.
/**
 * Finds water-mediated interactions in PDB structures and saves them to a
 * Parquet file.
 *
 * <p>Steps: read the command line options, read structures from the full MMTF
 * Hadoop sequence file, filter them by resolution and L-protein content,
 * compute group interactions around water oxygens, keep the bridging water
 * interactions, print a sample and the hit count, and write the result.
 *
 * @param args command line arguments, parsed by {@code getCommandLine}
 * @throws IOException declared by the signature; presumably raised while
 *         reading the MMTF data (not visible here)
 * @throws ParseException declared by the signature; presumably raised by
 *         {@code getCommandLine} for invalid options (not visible here)
 * @throws NumberFormatException if a numeric option value cannot be parsed
 */
public static void main(String[] args) throws IOException, ParseException {
    String timeStamp = new SimpleDateFormat("yyyyMMdd_HHmm").format(Calendar.getInstance().getTime());
    long start = System.nanoTime();

    // process command line options (defaults are provided for all but the output path)
    CommandLine cmd = getCommandLine(args);
    // NOTE(review): no default here; assumes getCommandLine enforces this option - confirm
    String outputPath = cmd.getOptionValue("output-path");
    System.out.println(outputPath);
    String resolution = cmd.getOptionValue("resolution", "2");
    String minInteractions = cmd.getOptionValue("min-interactions", "2");
    String maxInteractions = cmd.getOptionValue("max-interactions", "4");
    String distanceCutoff = cmd.getOptionValue("distance-cutoff", "3");
    String bFactorCutoff = cmd.getOptionValue("b-factor-cutoff", "1.645");
    boolean includeWaters = cmd.hasOption("include-waters");

    // Parse numeric options up front so that malformed values fail before Spark is started.
    float maxResolution = Float.parseFloat(resolution);
    float distance = Float.parseFloat(distanceCutoff);
    float normalizedBFactor = Float.parseFloat(bFactorCutoff);
    int minCount = Integer.parseInt(minInteractions);
    int maxCount = Integer.parseInt(maxInteractions);

    // get path to MMTF Hadoop Sequence file
    String path = MmtfReader.getMmtfFullPath();

    // initialize Spark
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(WaterInteractions.class.getSimpleName());

    // try-with-resources closes the Spark context even when a step below fails
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // read PDB structures and filter by resolution and only include proteins
        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc)
                .filter(new Resolution(0.0, maxResolution))
                .filter(new ContainsLProteinChain(true));

        // setup interaction criteria
        InteractionFilter filter = new InteractionFilter();
        filter.setDistanceCutoff(distance);
        filter.setNormalizedbFactorCutoff(normalizedBFactor);
        filter.setMinInteractions(minCount);
        filter.setMaxInteractions(maxCount);
        filter.setQueryGroups(true, "HOH");
        // only use water oxygen
        filter.setQueryElements(true, "O");
        filter.setTargetElements(true, "O", "N", "S");

        // exclude "uninteresting" ligands; waters are excluded too unless requested
        Set<String> prohibitedGroups = new HashSet<>(ExcludedLigandSets.ALL_GROUPS);
        if (!includeWaters) {
            prohibitedGroups.add("HOH");
        }
        filter.setProhibitedTargetGroups(prohibitedGroups);

        // calculate interactions
        Dataset<Row> data = GroupInteractionExtractor.getInteractions(pdb, filter);

        // keep only interactions with at least one organic ligand and one protein interaction
        data = filterBridgingWaterInteractions(data, maxInteractions).cache();

        // show some results
        data.show(50);
        System.out.println("Hits(all): " + data.count());

        // save interactions to a .parquet file whose name encodes the run parameters
        String waterTag = includeWaters ? "_w" : "";
        String filename = outputPath + "/water_pl" + "_r" + resolution + "_d" + distanceCutoff + "_b" + bFactorCutoff + "_i" + minInteractions + maxInteractions + waterTag + "_" + timeStamp + ".parquet";
        System.out.println("Saving results to: " + filename);
        data.coalesce(1).write().mode("overwrite").format("parquet").save(filename);
    }

    // the elapsed time includes shutting down Spark
    long end = System.nanoTime();
    System.out.println("Time: " + TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}
Use of org.rcsb.mmtf.api.StructureDataInterface in the project mmtf-spark by sbl-sdsc.
Class SecondaryStructureElementDemo, method main.
/**
 * Downloads the reduced MMTF file for a single PDB entry, keeps its L-protein
 * polymer chains, extracts secondary structure elements with the arguments
 * {@code "E"} and {@code 6}, and prints up to 50 rows plus the elapsed time.
 *
 * @param args command line arguments (not used)
 * @throws IOException declared by the signature; presumably raised while
 *         downloading or reading the MMTF data (not visible here)
 */
public static void main(String[] args) throws IOException {
    long start = System.nanoTime();

    // Name the Spark application after this demo class (was mislabelled as CustomReportDemo).
    SparkConf conf = new SparkConf().setMaster("local[1]").setAppName(SecondaryStructureElementDemo.class.getSimpleName());

    // try-with-resources closes the Spark context even when a step below fails
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // single protein chain
        List<String> pdbIds = Arrays.asList("1STP");
        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadReducedMmtfFiles(pdbIds, sc).cache();
        pdb = pdb.flatMapToPair(new StructureToPolymerChains()).filter(new ContainsLProteinChain());

        // NOTE(review): "E" and 6 look like the element type and a minimum length - confirm against the extractor
        Dataset<Row> ds = SecondaryStructureElementExtractor.getDataset(pdb, "E", 6);

        // show the top 50 rows of this dataset
        ds.show(50, false);

        long end = System.nanoTime();
        System.out.println("Time: " + TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
    }
}
Use of org.rcsb.mmtf.api.StructureDataInterface in the project mmtf-spark by sbl-sdsc.
Class SecondaryStructureSegmentDemo, method main.
/**
 * Downloads the reduced MMTF file for a single PDB entry, keeps its L-protein
 * polymer chains, extracts secondary structure segments of length 25, and
 * prints up to 50 rows plus the elapsed time.
 *
 * @param args command line arguments (not used)
 * @throws IOException declared by the signature; presumably raised while
 *         downloading or reading the MMTF data (not visible here)
 */
public static void main(String[] args) throws IOException {
    long start = System.nanoTime();

    // Name the Spark application after this demo class (was mislabelled as CustomReportDemo).
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(SecondaryStructureSegmentDemo.class.getSimpleName());

    // try-with-resources closes the Spark context even when a step below fails
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // single protein chain
        List<String> pdbIds = Arrays.asList("1STP");
        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadReducedMmtfFiles(pdbIds, sc).cache();
        pdb = pdb.flatMapToPair(new StructureToPolymerChains()).filter(new ContainsLProteinChain());

        int segmentLength = 25;
        Dataset<Row> ds = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength);

        // show the top 50 rows of this dataset
        ds.show(50, false);

        long end = System.nanoTime();
        System.out.println("Time: " + TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
    }
}
Aggregations