Example use of edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains in project mmtf-spark by sbl-sdsc:
the class MmtfImporterTest, method test4.
@Test
public void test4() throws IOException {
    Path p = Paths.get("./src/main/resources/files/test");
    // read all mmCIF files from the test directory into an RDD of (id, structure) pairs
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfImporter.importMmcifFiles(p.toString(), sc);
    // exactly one structure is expected in the test directory;
    // assertEquals (not assertTrue on ==) so a failure reports the actual count
    assertEquals(1, pdb.count());
    // split the structure into its individual polymer chains
    pdb = pdb.flatMapToPair(new StructureToPolymerChains());
    // the test structure contains 8 polymer chains
    assertEquals(8, pdb.count());
}
Example use of edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains in project mmtf-spark by sbl-sdsc:
the class MmtfReaderTest, method test2.
@Test
public void test2() throws IOException {
    Path p = Paths.get("./src/main/resources/files/test");
    // read all MMTF files from the test directory into an RDD of (id, structure) pairs
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readMmtfFiles(p.toString(), sc);
    // exactly one structure is expected in the test directory;
    // assertEquals (not assertTrue on ==) so a failure reports the actual count
    assertEquals(1, pdb.count());
    // split the structure into its individual polymer chains
    pdb = pdb.flatMapToPair(new StructureToPolymerChains());
    // the test structure contains 8 polymer chains
    assertEquals(8, pdb.count());
}
Example use of edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains in project mmtf-spark by sbl-sdsc:
the class KinaseSearch, method main.
/**
 * Searches the PDB for human protein-serine/threonine kinases (EC 2.7.11.1)
 * using SIFTS taxonomy and enzyme annotations, then prints the number of
 * matching polymer chains.
 *
 * @param args unused
 * @throws IOException if the reduced MMTF sequence file cannot be read
 */
public static void main(String[] args) throws IOException {
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(KinaseSearch.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // SIFTS query: human chains annotated with EC number 2.7.11.1
    // (protein-serine/threonine kinase)
    String sql = "SELECT t.pdbid, t.chain FROM sifts.pdb_chain_taxonomy AS t "
            + "JOIN sifts.pdb_chain_enzyme AS e ON (t.pdbid = e.pdbid AND t.chain = e.chain) "
            + "WHERE t.scientific_name = 'Homo sapiens' AND e.ec_number = '2.7.11.1'";

    // load the reduced PDB archive, break entries into polymer chains,
    // and keep only the chains returned by the PDBj Mine query
    JavaPairRDD<String, StructureDataInterface> structures = MmtfReader.readReducedSequenceFile(sc);
    JavaPairRDD<String, StructureDataInterface> chains = structures.flatMapToPair(new StructureToPolymerChains());
    JavaPairRDD<String, StructureDataInterface> hits = chains.filter(new PdbjMineSearch(sql));

    System.out.println("Number of entries matching query: " + hits.count());
    sc.close();
}
Example use of edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains in project mm-dev by sbl-sdsc:
the class ShapeTypeDemo, method main.
/**
 * Computes a shape classification for representative PDB chains, encodes their
 * sequences with Word2Vec n-grams, and saves the combined dataset as a
 * .parquet file.
 *
 * @param args args[0] = output file for the dataset
 * @throws IOException if the MMTF sequence file cannot be read
 */
public static void main(String[] args) throws IOException {
    // validate the command line before doing any work
    if (args.length != 1) {
        // fixed: usage message was missing the closing '>'
        System.err.println("Usage: " + ShapeTypeDemo.class.getSimpleName() + " <dataset output file>");
        System.exit(1);
    }
    String path = MmtfReader.getMmtfReducedPath();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(ShapeTypeDemo.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    long start = System.nanoTime();

    // load a representative subset of PDB chains
    // (Pisces filter: 90% max sequence identity, resolution better than 2.5 A)
    int sequenceIdentity = 90;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc)
            // extract polymer chains
            .flatMapToPair(new StructureToPolymerChains())
            // get representative subset
            .filter(new Pisces(sequenceIdentity, 2.5));

    // get a data set with sequence info
    Dataset<Row> seqData = PolymerSequenceExtractor.getDataset(pdb);

    // convert to BioJava data structure
    JavaPairRDD<String, Structure> structures = pdb.mapValues(new StructureToBioJava());

    // calculate shape data and convert to dataset
    JavaRDD<Row> rows = structures.map(t -> getShapeData(t));
    Dataset<Row> data = JavaRDDToDataset.getDataset(rows, "structureChainId", "shape");

    // there are only a few symmetric chains; leave them out
    data = data.filter("shape != 'EXCLUDE'");

    // join calculated shape data with the sequence data
    data = seqData.join(data, "structureChainId").cache();
    data.show(10);

    // create a Word2Vec representation of the protein sequences
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    // create 2-grams
    int n = 2;
    // 25-amino-residue window size for Word2Vec
    int windowSize = 25;
    // dimension of feature vector
    int vectorSize = 50;
    data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize).cache();

    // save data in .parquet file
    data.write().mode("overwrite").format("parquet").save(args[0]);

    long end = System.nanoTime();
    System.out.println((end - start) / 1E9 + " sec.");
    sc.close();
}
Example use of edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains in project mm-dev by sbl-sdsc:
the class DemoAllVsAll, method main.
/**
 * Runs an all-vs-all FatCat rigid structural alignment over a random sample of
 * a non-redundant set of PDB chains and prints timing statistics.
 *
 * @param args optional: args[0] = sample fraction (default 0.01)
 * @throws IOException if the MMTF sequence file cannot be read
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();
    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(DemoAllVsAll.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // Read PDB and create a Pisces non-redundant set at 20% sequence identity
    // and a resolution better than 1.6 A. Then take a 1% random sample.
    double fraction = 0.01;
    // optional command line argument overrides the sample fraction
    if (args.length == 1) {
        fraction = Double.parseDouble(args[0]);
    }
    // fixed seed so the sample is reproducible across runs
    long seed = 123;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(20, 1.6))
            .sample(false, fraction, seed);
    System.out.println(pdb.count());

    // run the structural alignment
    String algorithmName = FatCatRigid.algorithmName;
    Dataset<Row> alignments = StructureAligner.getAllVsAllAlignments(pdb, algorithmName).cache();

    // show results
    int count = (int) alignments.count();
    alignments.show(count);
    System.out.println("Pairs: " + count);

    long end = System.nanoTime();
    // guard against division by zero when the sample yields no alignment pairs
    if (count > 0) {
        System.out.println("Time per alignment: " + TimeUnit.NANOSECONDS.toMillis((end - start) / count) + " msec.");
    }
    System.out.println("Time: " + TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
    sc.close();
}
Aggregations