use of edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains in project mmtf-spark by sbl-sdsc.
the class SecondaryStructureWord2VecEncoder method main.
/**
 * Builds a Word2Vec-encoded dataset of secondary structure segments and
 * writes it to disk.
 *
 * <p>Steps: read the reduced MMTF sequence file, split it with
 * {@code StructureToPolymerChains}, filter with {@code Pisces}, extract
 * segments of fixed length, add the Word2Vec feature vector, and save the
 * result in the requested format.
 *
 * @param args {@code args[0]}: output file path; {@code args[1]}: output
 *             format passed to the Spark writer (e.g. json or parquet)
 * @throws IOException declared by the original signature; propagated from
 *                     the underlying read/extract calls
 */
public static void main(String[] args) throws IOException {
    // Validate the command line before touching the environment or starting Spark.
    if (args.length != 2) {
        System.err.println("Usage: " + SecondaryStructureWord2VecEncoder.class.getSimpleName() + " <outputFilePath> <fileFormat>");
        System.exit(1);
    }
    String path = MmtfReader.getMmtfReducedPath();
    long start = System.nanoTime();
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(SecondaryStructureWord2VecEncoder.class.getSimpleName());
    // try-with-resources guarantees the Spark context is stopped, even when a step fails
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // read MMTF Hadoop sequence file and create a non-redundant Pisces
        // subset set (<=20% seq. identity) of L-protein chains
        int sequenceIdentity = 20;
        double resolution = 3.0;
        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc)
                .flatMapToPair(new StructureToPolymerChains())
                .filter(new Pisces(sequenceIdentity, resolution));
        // get content
        int segmentLength = 11;
        Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength);
        // add Word2Vec encoded feature vector
        ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
        int n = 2;
        // window spans half of the segment on either side of its center
        int windowSize = (segmentLength - 1) / 2;
        int vectorSize = 50;
        data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize);
        data.printSchema();
        data.show(25, false);
        if (args[1].equals("json")) {
            // coalesce data into a single file
            data = data.coalesce(1);
        }
        data.write().mode("overwrite").format(args[1]).save(args[0]);
        long end = System.nanoTime();
        System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
    }
}
use of edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains in project mmtf-spark by sbl-sdsc.
the class DSSPDemo method main.
/**
 * Demo: downloads the full MMTF file for PDB entry 1STP, splits it with
 * {@code StructureToPolymerChains}, and prints the schema and the first two
 * rows of the dataset produced by {@code SecondaryStructureExtractor}.
 *
 * @param args not used
 * @throws IOException declared by the original signature
 */
public static void main(String[] args) throws IOException {
    long start = System.nanoTime();
    // Name the Spark app after the enclosing class; resolving it reflectively
    // avoids the copy-pasted reference to an unrelated demo class.
    String appName = java.lang.invoke.MethodHandles.lookup().lookupClass().getSimpleName();
    SparkConf conf = new SparkConf().setMaster("local[1]").setAppName(appName);
    // try-with-resources closes the context even if a step below throws
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // single protein chain
        List<String> pdbIds = Arrays.asList("1STP");
        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc).cache();
        pdb = pdb.flatMapToPair(new StructureToPolymerChains());
        Dataset<Row> ds = SecondaryStructureExtractor.getDataset(pdb);
        // show the schema of this dataset
        ds.printSchema();
        ds.show(2, false);
        long end = System.nanoTime();
        System.out.println("Time: " + (end - start) / 1E9 + "sec.");
    }
}
use of edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains in project mmtf-spark by sbl-sdsc.
the class SecondaryStructureElementDemo method main.
/**
 * Demo: downloads the reduced MMTF file for PDB entry 1STP, splits it with
 * {@code StructureToPolymerChains}, keeps entries accepted by
 * {@code ContainsLProteinChain}, and shows up to 50 rows of the dataset
 * returned by {@code SecondaryStructureElementExtractor.getDataset(pdb, "E", 6)}.
 *
 * @param args not used
 * @throws IOException declared by the original signature
 */
public static void main(String[] args) throws IOException {
    long start = System.nanoTime();
    // Name the Spark app after the enclosing class; resolving it reflectively
    // avoids the copy-pasted reference to an unrelated demo class.
    String appName = java.lang.invoke.MethodHandles.lookup().lookupClass().getSimpleName();
    SparkConf conf = new SparkConf().setMaster("local[1]").setAppName(appName);
    // try-with-resources closes the context even if a step below throws
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // single protein chain
        List<String> pdbIds = Arrays.asList("1STP");
        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadReducedMmtfFiles(pdbIds, sc).cache();
        pdb = pdb.flatMapToPair(new StructureToPolymerChains()).filter(new ContainsLProteinChain());
        Dataset<Row> ds = SecondaryStructureElementExtractor.getDataset(pdb, "E", 6);
        // show the top 50 rows of this dataset
        ds.show(50, false);
        long end = System.nanoTime();
        System.out.println("Time: " + TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
    }
}
use of edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains in project mmtf-spark by sbl-sdsc.
the class SecondaryStructureSegmentDemo method main.
/**
 * Demo: downloads the reduced MMTF file for PDB entry 1STP, splits it with
 * {@code StructureToPolymerChains}, keeps entries accepted by
 * {@code ContainsLProteinChain}, and shows up to 50 rows of the dataset
 * returned by {@code SecondaryStructureSegmentExtractor} for segments of
 * length 25.
 *
 * @param args not used
 * @throws IOException declared by the original signature
 */
public static void main(String[] args) throws IOException {
    long start = System.nanoTime();
    // Name the Spark app after the enclosing class; resolving it reflectively
    // avoids the copy-pasted reference to an unrelated demo class.
    String appName = java.lang.invoke.MethodHandles.lookup().lookupClass().getSimpleName();
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(appName);
    // try-with-resources closes the context even if a step below throws
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // single protein chain
        List<String> pdbIds = Arrays.asList("1STP");
        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadReducedMmtfFiles(pdbIds, sc).cache();
        pdb = pdb.flatMapToPair(new StructureToPolymerChains()).filter(new ContainsLProteinChain());
        int segmentLength = 25;
        Dataset<Row> ds = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength);
        // show the top 50 rows of this dataset
        ds.show(50, false);
        long end = System.nanoTime();
        System.out.println("Time: " + TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
    }
}
use of edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains in project mmtf-spark by sbl-sdsc.
the class MmtfImporterTest method test3.
/**
 * Imports the PDB files under the test resource directory and checks that
 * the single imported structure expands to eight entries once
 * {@code StructureToPolymerChains} is applied.
 *
 * @throws IOException declared by the original signature
 */
@Test
public void test3() throws IOException {
    Path p = Paths.get("./src/main/resources/files/test");
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfImporter.importPdbFiles(p.toString(), sc);
    // assertEquals reports the actual count on failure, unlike assertTrue(count == 1)
    assertEquals(1, pdb.count());
    pdb = pdb.flatMapToPair(new StructureToPolymerChains());
    assertEquals(8, pdb.count());
}
Aggregations