use of edu.sdsc.mmtf.spark.webfilters.Pisces in project mmtf-spark by sbl-sdsc.
the class SecondaryStructureElementsWord2VecEncoder method main.
/**
 * Extracts helical sequence segments from a Pisces subset of polymer chains,
 * adds a Word2Vec-encoded feature vector to each segment, prints samples of
 * the result, and optionally saves the encoded dataset.
 *
 * @param args either empty (nothing is saved), or args[0] outputFilePath and
 *             args[1] fileFormat (handed to the Spark writer's {@code format})
 * @throws IOException declared by this entry point; presumably raised while
 *             reading the MMTF input (not visible from this method alone)
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 0 && args.length != 2) {
        System.err.println("Usage: " + SecondaryStructureElementsWord2VecEncoder.class.getSimpleName() + " [<outputFilePath> + <fileFormat>]");
        System.exit(1);
    }

    long start = System.nanoTime();

    // name the Spark application after this class, matching the usage message above
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(SecondaryStructureElementsWord2VecEncoder.class.getSimpleName());

    // try-with-resources stops the Spark context on both normal and exceptional exit
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // read MMTF Hadoop sequence file and create a non-redundant Pisces
        // subset set (<=20% seq. identity) of L-protein chains
        int sequenceIdentity = 20;
        double resolution = 3.0;
        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc)
                .flatMapToPair(new StructureToPolymerChains())
                .filter(new Pisces(sequenceIdentity, resolution));

        int segmentLength = 11;

        // extract helical sequence segments
        Dataset<Row> data = SecondaryStructureElementExtractor.getDataset(pdb, "H", segmentLength);
        System.out.println(data.count());
        data.show(10, false);

        // add Word2Vec encoded feature vector
        ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
        int n = 2;
        // window covers the residues on one side of the segment's center
        int windowSize = (segmentLength - 1) / 2;
        int vectorSize = 50;
        data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize);
        data.show(50, false);

        // optionally, save results
        if (args.length > 0) {
            if (args[1].equals("json")) {
                // coalesce data into a single file
                data = data.coalesce(1);
            }
            data.write().mode("overwrite").format(args[1]).save(args[0]);
        }

        long end = System.nanoTime();
        System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
    }
}
use of edu.sdsc.mmtf.spark.webfilters.Pisces in project mmtf-spark by sbl-sdsc.
the class SecondaryStructureShiftedWord2VecEncoder method main.
/**
 * Builds a dataset of secondary structure segments from a Pisces subset of
 * polymer chains, encodes the sequences with a shifted 3-gram Word2Vec
 * model, prints the schema and a sample, and saves the result.
 *
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet)
 * @throws IOException declared by this entry point; presumably raised while
 *             reading the MMTF input (not visible from this method alone)
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 2) {
        System.err.println("Usage: " + SecondaryStructureShiftedWord2VecEncoder.class.getSimpleName() + " <outputFilePath> + <fileFormat>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(SecondaryStructureShiftedWord2VecEncoder.class.getSimpleName());

    // try-with-resources stops the Spark context on both normal and exceptional exit
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // read MMTF Hadoop sequence file and create a non-redundant set (<=20% seq. identity)
        // of L-protein chains
        int sequenceIdentity = 20;
        double resolution = 3.0;
        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc)
                .flatMapToPair(new StructureToPolymerChains())
                .filter(new Pisces(sequenceIdentity, resolution));

        // get content
        int segmentLength = 11;
        Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength);

        // create a Word2Vector representation of the protein sequences
        ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
        // window covers the residues on one side of the segment's center
        int windowSize = (segmentLength - 1) / 2;
        // dimension of feature vector (50)
        int vectorSize = 50;
        // cached because the dataset is used by show() and again by the write below
        data = encoder.shifted3GramWord2VecEncode(windowSize, vectorSize).cache();

        data.printSchema();
        data.show(25, false);

        if (args[1].equals("json")) {
            // coalesce data into a single file
            data = data.coalesce(1);
        }
        data.write().mode("overwrite").format(args[1]).save(args[0]);

        long end = System.nanoTime();
        System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
    }
}
use of edu.sdsc.mmtf.spark.webfilters.Pisces in project mmtf-spark by sbl-sdsc.
the class CreateRepresentativeSet method main.
/**
 * Creates a representative set of polymer chains using the Pisces filter
 * (40% sequence identity, 2.5 A resolution), prints the number of chains,
 * and writes the set as a sequence file next to the reduced MMTF input
 * (path suffix {@code _representatives_i40_r2.5}).
 *
 * @param args not used
 * @throws IOException declared by this entry point; presumably raised while
 *             reading or writing the sequence files (not visible from this
 *             method alone)
 */
public static void main(String[] args) throws IOException {
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(CreateRepresentativeSet.class.getSimpleName());

    // try-with-resources stops the Spark context even if counting or writing fails
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // filter by representative protein chains at 40% sequence identity
        // and 2.5 A resolution using the Pisces filter. Any pair of protein
        // chains in the representative set will have <= 40% sequence identity.
        int sequenceIdentity = 40;
        double resolution = 2.5;

        // read PDB, split entries into polymer chains, and filter by Pisces filter
        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readReducedSequenceFile(sc)
                .flatMapToPair(new StructureToPolymerChains())
                .filter(new Pisces(sequenceIdentity, resolution));

        System.out.println("# representative chains: " + pdb.count());

        // coalesce partitions to avoid saving many small files
        pdb = pdb.coalesce(12);

        // save representative set
        String path = MmtfReader.getMmtfReducedPath();
        MmtfWriter.writeSequenceFile(path + "_representatives_i40_r2.5", sc, pdb);
    }
}
use of edu.sdsc.mmtf.spark.webfilters.Pisces in project mmtf-spark by sbl-sdsc.
the class SecondaryStructureWord2VecModelEncoder method main.
/**
 * Builds a dataset of secondary structure segments from a Pisces subset of
 * polymer chains, adds a Word2Vec-encoded feature vector using a
 * pre-trained model read from file, prints the schema and a sample, and
 * saves the result.
 *
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet), args[2] word2VecModelFile
 * @throws IOException declared by this entry point; presumably raised while
 *             reading the MMTF input (not visible from this method alone)
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfFullPath();

    if (args.length != 3) {
        System.err.println("Usage: " + SecondaryStructureWord2VecModelEncoder.class.getSimpleName() + " <outputFilePath> + <fileFormat> + <word2VecModelFile>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(SecondaryStructureWord2VecModelEncoder.class.getSimpleName());

    // try-with-resources stops the Spark context on both normal and exceptional exit
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // read MMTF Hadoop sequence file and create a non-redundant Pisces
        // subset set (<=20% seq. identity) of L-protein chains
        int sequenceIdentity = 20;
        double resolution = 3.0;
        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc)
                .flatMapToPair(new StructureToPolymerChains())
                .filter(new Pisces(sequenceIdentity, resolution));

        // get content
        int segmentLength = 11;
        Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength);

        // add Word2Vec encoded feature vector using
        // a pre-trained Word2Vec model read from file
        ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
        int n = 2;
        String modelFileName = args[2];
        // cached because the dataset is used by show() and again by the write below
        data = encoder.overlappingNgramWord2VecEncode(modelFileName, n).cache();

        data.printSchema();
        data.show(25, false);

        if (args[1].equals("json")) {
            // coalesce data into a single file
            data = data.coalesce(1);
        }
        data.write().mode("overwrite").format(args[1]).save(args[0]);

        long end = System.nanoTime();
        System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
    }
}
use of edu.sdsc.mmtf.spark.webfilters.Pisces in project mm-dev by sbl-sdsc.
the class DemoAllVsAll_cluster method main.
/**
 * Runs an all-vs-all structural alignment (FatCat rigid) on a random sample
 * of a Pisces subset of polymer chains and prints the alignments together
 * with timing information.
 *
 * <p>The Spark master is not set here, so it must come from the launch
 * environment.
 *
 * @param args optional; when exactly one argument is given, args[0] is the
 *             sampling fraction (default 0.01)
 * @throws IOException declared by this entry point; presumably raised while
 *             reading the MMTF input (not visible from this method alone)
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();

    long start = System.nanoTime();

    SparkConf conf = new SparkConf();

    // try-with-resources stops the Spark context even if the alignment job fails
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // Read PDB and create a Pisces non-redundant set at 20% sequence identity and a resolution better than 1.6 A.
        // Then take a 1% random sample.
        double fraction = 0.01;

        // optional command line argument
        if (args.length == 1) {
            fraction = Double.parseDouble(args[0]);
        }

        // fixed seed for the random sample
        long seed = 123;
        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc)
                .flatMapToPair(new StructureToPolymerChains())
                .filter(new Pisces(20, 1.6))
                .sample(false, fraction, seed);

        System.out.println(pdb.count());

        // run the structural alignment
        String algorithmName = FatCatRigid.algorithmName;
        // cached because the dataset is counted and then shown
        Dataset<Row> alignments = StructureAligner.getAllVsAllAlignments(pdb, algorithmName).cache();

        // show results; keep the count as a long and clamp only for show(),
        // which takes an int number of rows
        long count = alignments.count();
        alignments.show((int) Math.min(count, Integer.MAX_VALUE));
        System.out.println("Pairs: " + count);

        long end = System.nanoTime();

        // a small sample can yield no alignments; skip the average to avoid dividing by zero
        if (count > 0) {
            System.out.println("Time per alignment: " + TimeUnit.NANOSECONDS.toMillis((end - start) / count) + " msec.");
        }
        System.out.println("Time: " + TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
    }
}
Aggregations