Use of org.rcsb.mmtf.api.StructureDataInterface in project mm-dev by sbl-sdsc.
The class SwissModelDatasetToStructure, method main.
public static void main(String[] args) throws IOException {
    SparkSession spark = SparkSession.builder()
            .master("local[*]")
            .appName(SwissModelDatasetToStructure.class.getSimpleName())
            .getOrCreate();
    JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

    List<String> uniProtIds = Arrays.asList("P36575", "P24539", "O00244", "P18846", "Q9UII2");

    // fetch SWISS-MODEL metadata for the given UniProt ids
    Dataset<Row> ds = SwissModelDataset.getSwissModels(uniProtIds);
    ds.show();

    // keep only high-quality models with sufficient sequence coverage
    ds = ds.filter("qmean > -2.5 AND coverage > 0.5");

    // collect the coordinate download URLs and fetch the models
    List<String> urls = ds.select("coordinates").as(Encoders.STRING()).collectAsList();
    System.out.println(urls);
    JavaPairRDD<String, StructureDataInterface> models = MmtfImporter.downloadSwissModelsByUrls(urls, sc);

    // print the polymer sequence of each downloaded model
    models.foreach(t -> System.out.println(t._2.getEntitySequence(0)));

    spark.close();
}
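To avoid re-downloading the models on every run, the resulting pair RDD can be written out as an MMTF-Hadoop sequence file. A minimal sketch, assuming mmtf-spark's MmtfWriter (the writer counterpart to MmtfReader.readSequenceFile); the output path is a placeholder:

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.rcsb.mmtf.api.StructureDataInterface;
import edu.sdsc.mmtf.spark.io.MmtfWriter;

public class SaveSwissModels {
    // writes one MMTF record per model, keyed as in the RDD above
    public static void save(JavaPairRDD<String, StructureDataInterface> models, JavaSparkContext sc) {
        MmtfWriter.writeSequenceFile("/tmp/swiss-models", sc, models); // placeholder path
    }
}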
Use of org.rcsb.mmtf.api.StructureDataInterface in project mm-dev by sbl-sdsc.
The class TestRosettaMmtf, method main.
/**
 * Test: Read an MMTF-Hadoop sequence file.
 *
 * @param args args[0] <path-to-mmtf-hadoop-sequence-file>
 *
 * @throws FileNotFoundException
 */
public static void main(String[] args) throws FileNotFoundException {
    // instantiate Spark
    // TODO set to local[1] !!!!
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("TestSwissModelMmtf");
    JavaSparkContext sc = new JavaSparkContext(conf);
    long start = System.nanoTime();

    // read an MMTF-Hadoop sequence file from the specified path
    JavaPairRDD<String, StructureDataInterface> structures = MmtfReader.readSequenceFile(args[0], sc);

    // total: 639 structures
    // structures = structures.filter(new ContainsDnaChain()); // ?
    // structures = structures.filter(new ContainsLProteinChain()); // 639?
    // structures = structures.filter(new ContainsGroup("ZN")); // 0
    // structures = structures.filter(new ContainsGroup("ATP")); //

    // debug: print structure data
    // structures.foreach(t -> TraverseStructureHierarchy.demo(t._2));
    // structures.foreach(t -> System.out.println(t._1));

    // sum the entity counts over all structures
    System.out.println(structures.map(t -> t._2.getNumEntities()).reduce((a, b) -> a + b));
    System.out.println("Number of structures read: " + structures.count());

    long end = System.nanoTime();
    System.out.println("Time: " + (end - start) / 1E9 + " sec.");

    // close Spark
    sc.close();
}
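The commented-out lines above sketch how subsets would be selected. As a hedged, self-contained example of one of those filters (assuming mmtf-spark's ContainsLProteinChain filter, referenced in the snippet):

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.rcsb.mmtf.api.StructureDataInterface;
import edu.sdsc.mmtf.spark.filters.ContainsLProteinChain;
import edu.sdsc.mmtf.spark.io.MmtfReader;

public class CountLProteinStructures {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("CountLProteinStructures");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaPairRDD<String, StructureDataInterface> structures = MmtfReader.readSequenceFile(args[0], sc);
        // keep only entries that contain at least one L-protein chain
        long n = structures.filter(new ContainsLProteinChain()).count();
        System.out.println("Structures with L-protein chains: " + n);
        sc.close();
    }
}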
Use of org.rcsb.mmtf.api.StructureDataInterface in project mmtf-spark by sbl-sdsc.
The class SecondaryStructureExtractor, method getSecStructFractions.
private static Row getSecStructFractions(Tuple2<String, StructureDataInterface> t) throws Exception {
    String key = t._1;
    StructureDataInterface structure = t._2;
    if (structure.getNumChains() != 1) {
        throw new IllegalArgumentException("This method can only be applied to a single polymer chain.");
    }

    StringBuilder dsspQ8 = new StringBuilder(structure.getEntitySequence(0).length());
    StringBuilder dsspQ3 = new StringBuilder(structure.getEntitySequence(0).length());
    float helix = 0;
    float sheet = 0;
    float coil = 0;

    int dsspIndex = 0;
    int structureIndex = 0;
    int seqIndex;
    for (int code : structure.getSecStructList()) {
        // pad sequence positions without a secondary structure assignment with "X"
        seqIndex = structure.getGroupSequenceIndices()[structureIndex++];
        while (dsspIndex < seqIndex) {
            dsspQ8.append("X");
            dsspQ3.append("X");
            dsspIndex++;
        }
        dsspQ8.append(DsspSecondaryStructure.getDsspCode(code).getOneLetterCode());
        dsspIndex++;
        // map the 8-state DSSP code onto the 3-state (Q3) alphabet
        switch (DsspSecondaryStructure.getQ3Code(code)) {
            case ALPHA_HELIX:
                helix++;
                dsspQ3.append("H");
                break;
            case EXTENDED:
                sheet++;
                dsspQ3.append("E");
                break;
            case COIL:
                coil++;
                dsspQ3.append("C");
                break;
            default:
                break;
        }
    }
    // pad any trailing positions without an assignment
    while (dsspIndex < structure.getEntitySequence(0).length()) {
        dsspQ8.append("X");
        dsspQ3.append("X");
        dsspIndex++;
    }

    // convert counts to fractions of the chain's groups
    int n = structure.getSecStructList().length;
    helix /= n;
    sheet /= n;
    coil /= n;

    return RowFactory.create(key, structure.getEntitySequence(0), helix, sheet, coil, dsspQ8.toString(), dsspQ3.toString());
}
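For context, this method is the per-chain workhorse behind the extractor's dataset API. A minimal usage sketch, assuming mmtf-spark's SecondaryStructureExtractor.getDataset and the StructureToPolymerChains mapper (which splits entries into the single-chain records this method requires):

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.rcsb.mmtf.api.StructureDataInterface;
import edu.sdsc.mmtf.spark.datasets.SecondaryStructureExtractor;
import edu.sdsc.mmtf.spark.io.MmtfReader;
import edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains;

public class SecStructDemo {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("SecStructDemo");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // split each structure into single polymer chains, then extract
        // per-chain helix/sheet/coil fractions and DSSP Q8/Q3 strings
        JavaPairRDD<String, StructureDataInterface> chains = MmtfReader
                .readSequenceFile(args[0], sc)
                .flatMapToPair(new StructureToPolymerChains());
        Dataset<Row> data = SecondaryStructureExtractor.getDataset(chains);
        data.show(10);
        sc.close();
    }
}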
Use of org.rcsb.mmtf.api.StructureDataInterface in project mmtf-spark by sbl-sdsc.
The class AtpInteractionAnalysis, method main.
/**
* @param args input arguments
* @throws IOException
*/
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfFullPath();
    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(AtpInteractionAnalysis.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);

    // filter by a representative subset (20% sequence identity, 2.0 A resolution)
    int sequenceIdentity = 20;
    double resolution = 2.0;
    pdb = pdb.filter(new Pisces(sequenceIdentity, resolution));

    // find ATP interactions within 3 Angstroms
    GroupInteractionExtractor finder = new GroupInteractionExtractor("ATP", 3);
    Dataset<Row> interactions = finder.getDataset(pdb).cache();

    // TODO add a line to only analyze interactions
    // with the oxygens in the terminal phosphate group of ATP
    // (O1G, O2G, O3G)
    // Tip: Google SQL LIKE
    interactions = interactions.filter("atom1 LIKE('O%G')");

    // show the data schema of the dataset and some data
    interactions.printSchema();
    interactions.show(20);

    long n = interactions.count();
    System.out.println("# interactions: " + n);

    System.out.println("Top interacting groups");
    Dataset<Row> topGroups = interactions.groupBy("residue2").count();
    // sort descending by count
    topGroups.sort(col("count").desc()).show(10);

    System.out.println("Top interacting group/atom types");
    Dataset<Row> topGroupsAndAtoms = interactions.groupBy("residue2", "atom2").count();
    topGroupsAndAtoms
            .withColumn("frequency", col("count").divide(n)) // add column with frequency of occurrence
            .sort(col("frequency").desc()) // sort descending
            .show(10);

    long end = System.nanoTime();
    System.out.println("Time: " + (end - start) / 1E9 + " sec.");

    sc.close();
}
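The SQL-string filter above can equivalently be written with Spark's typed Column API; both keep only the terminal phosphate oxygens (O1G, O2G, O3G). A small equivalent sketch (the wrapper class and method name are ours):

import static org.apache.spark.sql.functions.col;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

public class AtpFilters {
    // equivalent to interactions.filter("atom1 LIKE('O%G')")
    public static Dataset<Row> terminalPhosphateOxygens(Dataset<Row> interactions) {
        return interactions.filter(col("atom1").like("O%G"));
    }
}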
Use of org.rcsb.mmtf.api.StructureDataInterface in project mmtf-spark by sbl-sdsc.
The class InteractionAnalysisAdvanced, method main.
/**
* @param args no input arguments
* @throws IOException
*/
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfFullPath();
    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(InteractionAnalysisAdvanced.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);

    // get non-redundant subset (40% sequence identity, 2.5 A resolution)
    pdb = pdb.filter(new Pisces(40, 2.5));

    // find zinc interactions within 3 Angstroms
    GroupInteractionExtractor finder = new GroupInteractionExtractor("ZN", 3);
    Dataset<Row> interactions = finder.getDataset(pdb).cache();

    // show the data schema of the dataset and some data
    interactions.printSchema();
    interactions.show(20);

    long n = interactions.count();
    System.out.println("# interactions: " + n);

    System.out.println("Top interacting groups");
    Dataset<Row> topGroups = interactions.groupBy("residue2").count();
    // sort descending by count
    topGroups.sort(col("count").desc()).show(10);

    System.out.println("Top interacting group/atom types");
    Dataset<Row> topGroupsAndAtoms = interactions
            .filter("element2 != 'C'") // exclude carbon interactions
            .groupBy("residue2", "atom2")
            .count();
    topGroupsAndAtoms
            .withColumn("frequency", col("count").divide(n)) // add column with frequency of occurrence
            .filter("frequency > 0.01") // filter out occurrences < 1%
            .sort(col("frequency").desc()) // sort descending
            .show(20);

    // TODO print the top 10 interacting elements
    System.out.println("Top interacting elements");
    Dataset<Row> topElements = interactions
            .filter("element2 != 'C'") // exclude carbon interactions
            .groupBy("element2")
            .count();
    topElements
            .withColumn("frequency", col("count").divide(n))
            .filter("frequency > 0.01") // filter out occurrences < 1%
            .sort(col("frequency").desc()) // sort descending
            .show(10);

    interactions.groupBy("element2").avg("distance").sort("avg(distance)").show(10);

    // Aggregate multiple statistics.
    // Note: import static org.apache.spark.sql.functions.* is required;
    // see org.apache.spark.sql.functions for a list of all available functions.
    interactions.groupBy("element2")
            .agg(count("distance"), avg("distance"), min("distance"), max("distance"), kurtosis("distance"))
            .show(10);

    long end = System.nanoTime();
    System.out.println("Time: " + (end - start) / 1E9 + " sec.");

    sc.close();
}
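As the note in the code says, the aggregation functions come from static imports on org.apache.spark.sql.functions. A self-contained sketch of the same multi-statistic aggregation with those imports spelled out (the input Dataset is assumed to have the element2 and distance columns produced by GroupInteractionExtractor; the wrapper class is ours):

import static org.apache.spark.sql.functions.avg;
import static org.apache.spark.sql.functions.count;
import static org.apache.spark.sql.functions.kurtosis;
import static org.apache.spark.sql.functions.max;
import static org.apache.spark.sql.functions.min;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

public class DistanceStats {
    // per-element summary statistics of interaction distances
    public static Dataset<Row> perElement(Dataset<Row> interactions) {
        return interactions.groupBy("element2")
                .agg(count("distance"), avg("distance"),
                     min("distance"), max("distance"), kurtosis("distance"));
    }
}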