Example use of org.rcsb.mmtf.api.StructureDataInterface in the mm-dev project by sbl-sdsc:
the main method of the ImportPdbFiles class.
/**
 * Converts a directory containing Rosetta-style PDB files into an MMTF-Hadoop Sequence file.
 * The input directory is traversed recursively to find PDB files.
 *
 * <p> Example files from Gremlin website:
 * https://gremlin2.bakerlab.org/meta/aah4043_final.zip
 *
 * @param args args[0] <path-to-pdb_files>, args[1] <path-to-mmtf-hadoop-file>
 *
 * @throws FileNotFoundException if the input directory cannot be read
 */
public static void main(String[] args) throws FileNotFoundException {
    // both the input directory and the output file location are required
    if (args.length != 2) {
        System.err.println("Usage: ImportPdbFiles <path-to-pdb_files> <path-to-mmtf-hadoop-file>");
        System.exit(1);
    }
    // instantiate Spark
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("ImportPdbFiles");
    JavaSparkContext sc = new JavaSparkContext(conf);
    try {
        // read PDB files recursively starting the specified directory
        JavaPairRDD<String, StructureDataInterface> structures = MmtfImporter.importPdbFiles(args[0], sc);
        structures.foreach(t -> System.out.println(t._1));
        System.out.println("Number of structures read: " + structures.count());
        // structures.foreach(t -> TraverseStructureHierarchy.demo(t._2));
        // save as an MMTF-Hadoop Sequence File
        // NOTE(review): the write is still disabled; args[1] is documented as the output
        // path but currently unused. Uncomment to produce the file described in the Javadoc.
        // MmtfWriter.writeSequenceFile(args[1], sc, structures);
    } finally {
        // close Spark even if the import fails
        sc.close();
    }
}
Example use of org.rcsb.mmtf.api.StructureDataInterface in the mm-dev project by sbl-sdsc:
the main method of the ShapeTypeDemo class.
/**
 * Computes a shape classification for representative polymer chains, joins it with
 * the chain sequences, encodes the sequences with Word2Vec n-grams, and saves the
 * result as a Parquet file.
 *
 * @param args args[0] <dataset output file>
 * @throws IOException if the MMTF Hadoop sequence file cannot be read
 */
public static void main(String[] args) throws IOException {
    // validate the command line before doing any work
    if (args.length != 1) {
        System.err.println("Usage: " + ShapeTypeDemo.class.getSimpleName() + " <dataset output file>");
        System.exit(1);
    }
    String path = MmtfReader.getMmtfReducedPath();
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(ShapeTypeDemo.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    long start = System.nanoTime();
    // load representative PDB chains from the 90% seq. identity Pisces subset
    int sequenceIdentity = 90;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc).flatMapToPair(// extract polymer chains
    new StructureToPolymerChains()).filter(// get representative subset
    new Pisces(sequenceIdentity, 2.5));
    // get a data set with sequence info
    Dataset<Row> seqData = PolymerSequenceExtractor.getDataset(pdb);
    // convert to BioJava data structure
    JavaPairRDD<String, Structure> structures = pdb.mapValues(new StructureToBioJava());
    // calculate shape data and convert to dataset
    JavaRDD<Row> rows = structures.map(t -> getShapeData(t));
    Dataset<Row> data = JavaRDDToDataset.getDataset(rows, "structureChainId", "shape");
    // there are only a few symmetric chains, leave them out
    data = data.filter("shape != 'EXCLUDE'");
    // join calculated data with the sequence data
    data = seqData.join(data, "structureChainId").cache();
    data.show(10);
    // create a Word2Vector representation of the protein sequences
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    // create 2-grams
    int n = 2;
    // 25-amino residue window size for Word2Vector
    int windowSize = 25;
    // dimension of feature vector
    int vectorSize = 50;
    data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize).cache();
    // save data in .parquet file
    data.write().mode("overwrite").format("parquet").save(args[0]);
    long end = System.nanoTime();
    System.out.println((end - start) / 1E9 + " sec.");
    sc.close();
}
Example use of org.rcsb.mmtf.api.StructureDataInterface in the mm-dev project by sbl-sdsc:
the test method of the ReducedEncoderNewTest class.
// Verifies atom and bond counts produced by ReducedEncoder (C-alpha/reduced
// representation) for a small set of full MMTF structures downloaded from RCSB.
@Test
public void test() {
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(FilterByRFree.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    // List<String> pdbIds = Arrays.asList("1STP","4HHB","2ONX","1JLP","5X6H","5L2G","2MK1");
    List<String> pdbIds = Arrays.asList("1STP", "4HHB", "2ONX", "2CCV");
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc).cache();
    // debug probes for the full (unreduced) structures:
    // pdb.foreach(t -> System.out.println(t._1 + "o :" + t._2.getNumBonds()));
    // List<String> chainIds = pdb.map(t -> t._1 + "_chainId_" + Arrays.toString(t._2.getChainIds())).collect();
    // System.out.println("full: " + chainIds);
    // List<String> chainNames = pdb.map(t -> t._1 + "_chainNames_" + Arrays.toString(t._2.getChainNames())).collect();
    // System.out.println("full: " + chainNames);
    // List<String> numGroups = pdb.map(t -> t._1 + "_numGroups_" + t._2.getNumGroups()).collect();
    // System.out.println("full: " + numGroups);
    // List<String> altlocs = pdb.map(t -> t._1 + "_altLocs_" + Arrays.toString(t._2.getAltLocIds())).collect();
    // System.out.println("full: " + altlocs);
    // apply the reduced encoding (C-alpha trace + non-polymer groups) to each structure
    pdb = pdb.mapValues(v -> ReducedEncoder.getReduced(v)).cache();
    // debug probes for the reduced structures:
    // chainIds = pdb.map(t -> t._1 + "_chainId_" + Arrays.toString(t._2.getChainIds())).collect();
    // System.out.println("reduced: " + chainIds);
    // chainNames = pdb.map(t -> t._1 + "_chainNames_" + Arrays.toString(t._2.getChainNames())).collect();
    // System.out.println("reduced: " + chainNames);
    // altlocs = pdb.map(t -> t._1 + "_altLocs_" + Arrays.toString(t._2.getAltLocIds())).collect();
    // System.out.println("reduced: " + altlocs);
    // expected group counts after reduction:
    // 1STP # groups 121 CA + 1 BTN = 122
    // 4HHB # groups 141x2 + 146x2 CA + 4 HEM + 2P (from PO4) = 580
    // 2ONX # groups 4 CA = 4
    // 2CCV # atoms 99 CA + 4 altloc CA + 1 A2G (sugar) + 1 NAG (orig 15) + 1 GOL + 1 ZN, 1 ACE = 108
    // TODO (4 altlocs missing?)
    // numGroups = pdb.map(t -> t._1 + "_numGroups_" + t._2.getNumGroups()).collect();
    // System.out.println("reduced: " + numGroups);
    List<String> atoms = pdb.map(t -> t._1 + "_atoms_" + t._2.getNumAtoms()).collect();
    // System.out.println(atoms);
    // expected atom counts after reduction:
    // 1STP # atoms 121 CA + 16 BTN
    // 4HHB # atom 141x2 + 146x2 CA + 43x4 HEM + 2P (from PO4) = 748
    // 2ONX # atoms 4 CA
    // 2CCV # atoms 99 CA + 4 (5?) altloc CA + 15 A2G (sugar) + 14 NAG (orig 15) + 6 GOL + 1 ZN, ACE 4 = 143
    assertTrue(atoms.contains("1STP_atoms_137"));
    assertTrue(atoms.contains("4HHB_atoms_748"));
    assertTrue(atoms.contains("2ONX_atoms_4"));
    assertTrue(atoms.contains("2CCV_atoms_143"));
    List<String> bonds = pdb.map(t -> t._1 + "_bonds_" + t._2.getNumBonds()).collect();
    // expected bond counts after reduction:
    // 1STP # bond 17 BTN
    // 4HHB # bonds 50 x 4 HEM = 200
    // 2ONX # bonds 0
    // 2CCV # bonds 15 A2G+ 14 NAG (-O) + 5 GOL + 3 ACE + 2 disulfide bridges + 1 covalent bond to NAG = 40
    assertTrue(bonds.contains("1STP_bonds_17"));
    assertTrue(bonds.contains("4HHB_bonds_200"));
    assertTrue(bonds.contains("2ONX_bonds_0"));
    assertTrue(bonds.contains("2CCV_bonds_40"));
    sc.close();
}
Example use of org.rcsb.mmtf.api.StructureDataInterface in the mm-dev project by sbl-sdsc:
the main method of the DemoAllVsAll class.
/**
 * Runs an all-vs-all structural alignment over a random sample of a Pisces
 * non-redundant chain set and prints the alignment results and timing.
 *
 * @param args optional: args[0] sample fraction (default 0.01)
 * @throws IOException if the MMTF Hadoop sequence file cannot be read
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();
    long start = System.nanoTime();
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(DemoAllVsAll.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    // Read PDB and create a Pisces non-redundant set at 20% sequence identity and a resolution better than 1.6 A.
    // Then take a 1% random sample (or the fraction given on the command line).
    double fraction = 0.01;
    // optional command line argument
    if (args.length == 1) {
        fraction = Double.parseDouble(args[0]);
    }
    // fixed seed keeps the sample reproducible across runs
    long seed = 123;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc).flatMapToPair(new StructureToPolymerChains()).filter(new Pisces(20, 1.6)).sample(false, fraction, seed);
    System.out.println(pdb.count());
    // run the structural alignment
    String algorithmName = FatCatRigid.algorithmName;
    Dataset<Row> alignments = StructureAligner.getAllVsAllAlignments(pdb, algorithmName).cache();
    // show results
    int count = (int) alignments.count();
    alignments.show(count);
    System.out.println("Pairs: " + count);
    long end = System.nanoTime();
    // guard against division by zero when the sample yields no alignment pairs
    if (count > 0) {
        System.out.println("Time per alignment: " + TimeUnit.NANOSECONDS.toMillis((end - start) / count) + " msec.");
    }
    System.out.println("Time: " + TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
    sc.close();
}
Example use of org.rcsb.mmtf.api.StructureDataInterface in the mm-dev project by sbl-sdsc:
the main method of the Driver1 class.
/**
 * Aligns a query structure (1STP) against a target structure (4OKA) chain-by-chain
 * using an exhaustive alignment, writes the alignments as CSV, and prints them
 * sorted by TM-score.
 *
 * @param args args[0] <csv output path>
 * @throws IOException if the MMTF files cannot be downloaded or read
 */
public static void main(String[] args) throws IOException {
    // args[0] is required as the CSV output location
    if (args.length != 1) {
        System.err.println("Usage: " + Driver1.class.getSimpleName() + " <csv output path>");
        System.exit(1);
    }
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(Driver1.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    long start = System.nanoTime();
    // download query structure
    // List<String> queryId = Arrays.asList("2O9U");
    List<String> queryId = Arrays.asList("1STP");
    JavaPairRDD<String, StructureDataInterface> query = MmtfReader.downloadReducedMmtfFiles(queryId, sc).flatMapToPair(new StructureToPolymerChains(false, true));
    // Examples similar: 4N6T, 2CH9, 3UL5, 3KVP
    // Examples dissimilar: 5O5I, 1STP,
    // List<String> targetId = Arrays.asList("4N6T", "2CH9", "3UL5", "3KVP", "1STP", "5O5I");
    List<String> targetId = Arrays.asList("4OKA");
    JavaPairRDD<String, StructureDataInterface> target = MmtfReader.downloadReducedMmtfFiles(targetId, sc).flatMapToPair(new StructureToPolymerChains(false, true));
    // two standard algorithms
    // String alignmentAlgorithm = CeMain.algorithmName;
    // String alignmentAlgorithm = FatCatRigid.algorithmName;
    String alignmentAlgorithm = "exhaustive";
    // calculate alignments
    Dataset<Row> alignments = StructureAligner.getQueryVsAllAlignments(query, target, alignmentAlgorithm).cache();
    alignments.coalesce(1).write().mode("overwrite").format("csv").save(args[0]);
    // show results, best TM-score first
    int count = (int) alignments.count();
    alignments.sort(col("tm").desc()).show(count);
    System.out.println("Pairs: " + count);
    long end = System.nanoTime();
    // guard against division by zero when no alignments were produced
    if (count > 0) {
        System.out.println("Time per alignment: " + TimeUnit.NANOSECONDS.toMillis((end - start) / count) + " msec.");
    }
    System.out.println("Time: " + TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
    sc.close();
}
Aggregations