use of edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains in project mmtf-spark by sbl-sdsc.
the class PolyPeptideChainStatistics method main.
/**
 * Prints summary statistics of the chain lengths (number of groups per chain)
 * for the polymer chains read from the reduced MMTF-Hadoop sequence file that
 * pass the {@code PolymerComposition.AMINO_ACIDS_20} filter.
 *
 * @param args command line arguments (not used)
 * @throws FileNotFoundException declared by the signature; presumably raised by
 *         {@code MmtfReader} when the sequence file cannot be located
 */
public static void main(String[] args) throws FileNotFoundException {
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(PolyPeptideChainStatistics.class.getSimpleName());

    // try-with-resources closes the Spark context even when the job fails
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        JavaDoubleRDD chainLengths = MmtfReader
                // read PDB from MMTF-Hadoop sequence file
                .readReducedSequenceFile(sc)
                // split (flatmap) into unique polymer chains
                .flatMapToPair(new StructureToPolymerChains(false, true))
                // only consider chains that contain the 20 standard amino acids
                .filter(new PolymerComposition(PolymerComposition.AMINO_ACIDS_20))
                // get the number of groups (residues) in each chain using a lambda expression
                .mapToDouble(t -> t._2.getNumGroups());

        System.out.println("Protein chains length statistics for proteins in the PDB with the 20 standard amino acids:");
        System.out.println(chainLengths.stats());
    }
}
use of edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains in project mmtf-spark by sbl-sdsc.
the class PdbSequenceToWord2Vec method main.
/**
 * Trains a Word2Vec model on overlapping n-grams of polymer sequences from a
 * Pisces-filtered subset of the MMTF Hadoop sequence file and saves the model.
 *
 * @param args args[0] output file name the Word2Vec model is saved to
 * @throws IOException declared by the signature; presumably raised while
 *         reading the input or saving the model
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 1) {
        System.err.println("Usage: " + PdbSequenceToWord2Vec.class.getSimpleName() + " <outputFileName>");
        System.exit(1);
    }

    long start = System.nanoTime();

    // name the Spark application after this class (was mislabeled with another encoder's class name)
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(PdbSequenceToWord2Vec.class.getSimpleName());

    // try-with-resources closes the Spark context even when the job fails
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // read MMTF Hadoop sequence file and create a non-redundant Pisces
        // subset set (<=40% seq. identity) of L-protein chains
        int sequenceIdentity = 40;
        double resolution = 3.0;

        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
                .readSequenceFile(path, sc)
                .flatMapToPair(new StructureToPolymerChains())
                .filter(new Pisces(sequenceIdentity, resolution));

        Dataset<Row> data = PolymerSequenceExtractor.getDataset(pdb);
        data.show(10, false);

        // length of polymer sequence segment (number of residues)
        int segmentLength = 11;

        // add Word2Vec encoded feature vector
        ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
        // size of n-grams
        int n = 2;
        // window size is derived from the segment length: (11 - 1) / 2 = 5
        int windowSize = (segmentLength - 1) / 2;
        // dimension of vector
        int vectorSize = 50;
        data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize);

        encoder.getWord2VecModel().save(args[0]);

        long end = System.nanoTime();
        System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
    }
}
use of edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains in project mmtf-spark by sbl-sdsc.
the class SecondaryStructureBlosum62Encoder method main.
/**
 * Extracts secondary structure sequence segments from a Pisces-filtered subset
 * of the MMTF Hadoop sequence file, removes duplicates, adds a BLOSUM62
 * encoded feature vector, and writes the resulting dataset.
 *
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet)
 * @throws IOException declared by the signature; presumably raised while
 *         reading the input
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 2) {
        System.err.println("Usage: " + SecondaryStructureBlosum62Encoder.class.getSimpleName() + " <outputFilePath> + <fileFormat>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(SecondaryStructureBlosum62Encoder.class.getSimpleName());

    // try-with-resources closes the Spark context even when the job fails
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // read MMTF Hadoop sequence file and create a non-redundant Pisces
        // subset set (<=20% seq. identity) of L-protein chains
        int sequenceIdentity = 20;
        double resolution = 3.0;

        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
                .readSequenceFile(path, sc)
                .flatMapToPair(new StructureToPolymerChains())
                .filter(new Pisces(sequenceIdentity, resolution));

        // length of each extracted sequence segment
        int segmentLength = 11;
        Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength).cache();
        System.out.println("original data : " + data.count());

        // drop rows that repeat the same (labelQ3, sequence) pair
        data = data.dropDuplicates("labelQ3", "sequence").cache();
        System.out.println("- duplicate Q3/seq: " + data.count());

        // then keep a single row per sequence
        data = data.dropDuplicates("sequence").cache();
        System.out.println("- duplicate seq : " + data.count());

        // add a BLOSUM62 encoded feature vector
        ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
        data = encoder.blosum62Encode();

        data.printSchema();
        data.show(25, false);

        if (args[1].equals("json")) {
            // coalesce data into a single file
            data = data.coalesce(1);
        }
        data.write().mode("overwrite").format(args[1]).save(args[0]);

        long end = System.nanoTime();
        System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
    }
}
use of edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains in project mmtf-spark by sbl-sdsc.
the class SecondaryStructureOneHotEncoder method main.
/**
 * Extracts secondary structure sequence segments from a Pisces-filtered subset
 * of the MMTF Hadoop sequence file, removes duplicates, adds a one-hot encoded
 * feature vector, and writes the resulting dataset.
 *
 * <p>NOTE(review): the usage message advertises an optional
 * {@code <modelFileName>} argument, but this method never reads
 * {@code args[2]}; any extra arguments are accepted and ignored.
 *
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet)
 * @throws IOException declared by the signature; presumably raised while
 *         reading the input
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();

    if (args.length < 2) {
        System.err.println("Usage: " + SecondaryStructureOneHotEncoder.class.getSimpleName() + " <outputFilePath> + <fileFormat> + [<modelFileName>]");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(SecondaryStructureOneHotEncoder.class.getSimpleName());

    // try-with-resources closes the Spark context even when the job fails
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // read MMTF Hadoop sequence file and create a non-redundant Pisces
        // subset set (<=20% seq. identity) of L-protein chains
        int sequenceIdentity = 20;
        double resolution = 3.0;

        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
                .readSequenceFile(path, sc)
                .flatMapToPair(new StructureToPolymerChains())
                .filter(new Pisces(sequenceIdentity, resolution));

        // get content
        int segmentLength = 11;
        Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength).cache();
        System.out.println("original data : " + data.count());

        // drop rows that repeat the same (labelQ3, sequence) pair
        data = data.dropDuplicates("labelQ3", "sequence").cache();
        System.out.println("- duplicate Q3/seq: " + data.count());

        // then keep a single row per sequence
        data = data.dropDuplicates("sequence").cache();
        System.out.println("- duplicate seq : " + data.count());

        // add one-hot encoded sequence feature vector to dataset
        ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
        data = encoder.oneHotEncode();

        data.printSchema();
        data.show(25, false);

        if (args[1].equals("json")) {
            // coalesce data into a single file
            data = data.coalesce(1);
        }
        data.write().mode("overwrite").format(args[1]).save(args[0]);

        long end = System.nanoTime();
        System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
    }
}
use of edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains in project mmtf-spark by sbl-sdsc.
the class SecondaryStructurePropertyEncoder method main.
/**
 * Extracts secondary structure sequence segments from a Pisces-filtered subset
 * of the MMTF Hadoop sequence file, removes duplicates, adds a property
 * encoded feature vector, and writes the resulting dataset.
 *
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet)
 * @throws IOException declared by the signature; presumably raised while
 *         reading the input
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 2) {
        System.err.println("Usage: " + SecondaryStructurePropertyEncoder.class.getSimpleName() + " <outputFilePath> + <fileFormat>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(SecondaryStructurePropertyEncoder.class.getSimpleName());

    // try-with-resources closes the Spark context even when the job fails
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // read MMTF Hadoop sequence file and create a non-redundant Pisces
        // subset set (<=20% seq. identity) of L-protein chains
        int sequenceIdentity = 20;
        double resolution = 3.0;

        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
                .readSequenceFile(path, sc)
                .flatMapToPair(new StructureToPolymerChains())
                .filter(new Pisces(sequenceIdentity, resolution));

        // get content
        int segmentLength = 11;
        Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength).cache();
        System.out.println("original data : " + data.count());

        // drop rows that repeat the same (labelQ3, sequence) pair
        data = data.dropDuplicates("labelQ3", "sequence").cache();
        System.out.println("- duplicate Q3/seq: " + data.count());

        // then keep a single row per sequence
        data = data.dropDuplicates("sequence").cache();
        System.out.println("- duplicate seq : " + data.count());

        // add a property encoded feature vector
        ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
        data = encoder.propertyEncode();

        data.printSchema();
        data.show(25, false);

        if (args[1].equals("json")) {
            // coalesce data into a single file
            data = data.coalesce(1);
        }
        data.write().mode("overwrite").format(args[1]).save(args[0]);

        long end = System.nanoTime();
        System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
    }
}
Aggregations