Example use of edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains in the project mmtf-spark by sbl-sdsc: the main method of the SequenceSimilarityDemo class.
/**
 * Searches the reduced PDB archive for polymer chains whose sequence is
 * similar to a query sequence (BLAST), printing each matching chain id
 * together with its entity sequence.
 *
 * @param args unused
 * @throws IOException if the MMTF sequence file cannot be read
 */
public static void main(String[] args) throws IOException {
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(SequenceSimilarityDemo.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // Query sequence and BLAST search parameters.
    String sequence = "NLVQFGVMIEKMTGKSALQYNDYGCYCGIGGSHWPVDQ";
    double eValueCutoff = 0.001;
    int sequenceIdentityCutoff = 40;
    boolean maskLowComplexity = true;

    // Read the PDB in MMTF format and split structures into polymer chains.
    JavaPairRDD<String, StructureDataInterface> chains =
            MmtfReader.readReducedSequenceFile(sc)
                      .flatMapToPair(new StructureToPolymerChains(false, true));

    // Keep only chains passing the sequence-similarity filter and print them.
    SequenceSimilarity filter = new SequenceSimilarity(
            sequence, SequenceSimilarity.BLAST, eValueCutoff, sequenceIdentityCutoff, maskLowComplexity);
    chains.filter(filter)
          .foreach(t -> System.out.println(t._1 + ": " + t._2.getEntitySequence(0)));

    sc.close();
}
Example use of edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains in the project mmtf-spark by sbl-sdsc: the main method of the SecondaryStructureWord2VecModelEncoder class.
/**
 * Extracts fixed-length secondary-structure segments from a non-redundant
 * subset of PDB protein chains and encodes their sequences with a
 * pre-trained Word2Vec model, writing the resulting dataset to file.
 *
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet), args[2] word2VecModelFile
 * @throws IOException if the MMTF sequence file cannot be read
 */
public static void main(String[] args) throws IOException {
    // Validate arguments before touching the file system.
    if (args.length != 3) {
        // Fixed usage string: the original embedded stray "+" characters inside the literal.
        System.err.println("Usage: " + SecondaryStructureWord2VecModelEncoder.class.getSimpleName()
                + " <outputFilePath> <fileFormat> <word2VecModelFile>");
        System.exit(1);
    }
    String path = MmtfReader.getMmtfFullPath();

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]")
            .setAppName(SecondaryStructureWord2VecModelEncoder.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // Read MMTF Hadoop sequence file and create a non-redundant Pisces
    // subset (<=20% seq. identity, resolution <= 3.0 A) of L-protein chains.
    int sequenceIdentity = 20;
    double resolution = 3.0;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));

    // Extract overlapping secondary-structure segments of fixed length.
    int segmentLength = 11;
    Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength);

    // Add a Word2Vec-encoded feature vector using a pre-trained
    // Word2Vec model read from file (args[2], not args[3] as the old javadoc claimed).
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    int n = 2; // n-gram size
    String modelFileName = args[2];
    data = encoder.overlappingNgramWord2VecEncode(modelFileName, n).cache();

    data.printSchema();
    data.show(25, false);

    if (args[1].equals("json")) {
        // Coalesce data into a single file for JSON output.
        data = data.coalesce(1);
    }
    data.write().mode("overwrite").format(args[1]).save(args[0]);

    long end = System.nanoTime();
    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");

    // Release Spark resources (was leaked in the original).
    sc.close();
}
Example use of edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains in the project mm-dev by sbl-sdsc: the main method of the DemoAllVsAll_cluster class.
/**
 * Computes all-vs-all structural alignments (FatCat rigid) on a random
 * sample of a non-redundant PDB subset and reports timing statistics.
 * Master URL/app name are expected to be supplied by spark-submit, hence
 * the bare SparkConf.
 *
 * @param args optional: args[0] sample fraction (default 0.01)
 * @throws IOException if the MMTF sequence file cannot be read
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();
    long start = System.nanoTime();

    SparkConf conf = new SparkConf();
    JavaSparkContext sc = new JavaSparkContext(conf);

    // Read PDB and create a Pisces non-redundant set at 20% sequence identity
    // and a resolution better than 1.6 A. Then take a random sample
    // (default 1%, overridable via the command line).
    double fraction = 0.01;
    if (args.length == 1) {
        fraction = Double.parseDouble(args[0]);
    }
    long seed = 123; // fixed seed for reproducible sampling
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(20, 1.6))
            .sample(false, fraction, seed);
    System.out.println(pdb.count());

    // Run the structural alignment.
    String algorithmName = FatCatRigid.algorithmName;
    Dataset<Row> alignments = StructureAligner.getAllVsAllAlignments(pdb, algorithmName).cache();

    // Show results.
    int count = (int) alignments.count();
    alignments.show(count);
    System.out.println("Pairs: " + count);

    long end = System.nanoTime();
    // Guard against division by zero when the sample yields no alignment pairs.
    if (count > 0) {
        System.out.println("Time per alignment: " + TimeUnit.NANOSECONDS.toMillis((end - start) / count) + " msec.");
    }
    System.out.println("Time: " + TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
    sc.close();
}
Example use of edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains in the project mm-dev by sbl-sdsc: the main method of the DemoQueryVsAll class.
/**
 * Aligns a single query structure (PDB 2W47) against a sampled,
 * non-redundant subset of the PDB using FatCat rigid alignment and prints
 * the results sorted by TM-score.
 *
 * @param args unused
 * @throws IOException if MMTF files cannot be read or downloaded
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(DemoQueryVsAll.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    long start = System.nanoTime();

    // Download query structure and split it into polymer chains.
    List<String> queryId = Arrays.asList("2W47");
    JavaPairRDD<String, StructureDataInterface> query = MmtfReader.downloadFullMmtfFiles(queryId, sc)
            .flatMapToPair(new StructureToPolymerChains());

    // Read the full archive (fraction = 1.0), filter by the Pisces
    // non-redundant set at 20% sequence identity and a resolution better
    // than 1.6 A, then take an 8% random sample.
    // (The original comment claimed a "1 % sample"; the code reads the whole
    // archive and samples 8% of the filtered chains.)
    double fraction = 1.0;
    long seed = 123; // fixed seed for reproducible sampling
    JavaPairRDD<String, StructureDataInterface> target = MmtfReader.readSequenceFile(path, fraction, seed, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(20, 1.6))
            .sample(false, 0.08, seed);

    // Specialized algorithms:
    // String alignmentAlgorithm = CeMain.algorithmName;
    // String alignmentAlgorithm = CeCPMain.algorithmName;
    // String alignmentAlgorithm = FatCatFlexible.algorithmName;
    // Two standard algorithms:
    // String alignmentAlgorithm = CeMain.algorithmName;
    String alignmentAlgorithm = FatCatRigid.algorithmName;
    // String alignmentAlgorithm = ExhaustiveAligner.alignmentAlgorithm;

    // Calculate alignments.
    Dataset<Row> alignments = StructureAligner.getQueryVsAllAlignments(query, target, alignmentAlgorithm).cache();

    // Show results, best TM-score first.
    int count = (int) alignments.count();
    alignments.sort(col("tm").desc()).show(count);
    System.out.println("Pairs: " + count);

    long end = System.nanoTime();
    // Guard against division by zero when no alignments were produced.
    if (count > 0) {
        System.out.println("Time per alignment: " + TimeUnit.NANOSECONDS.toMillis((end - start) / count) + " msec.");
    }
    System.out.println("Time: " + TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
    sc.close();
}
Example use of edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains in the project mm-dev by sbl-sdsc: the getSequenceData method of the CathClassificationDataset class.
/**
 * Builds a two-column dataset (structureChainId, sequence) from the MMTF
 * Hadoop sequence file at args[0], restricted to a Pisces non-redundant
 * subset (40% sequence identity, 2.0 A resolution).
 *
 * @param args args[0] path to the MMTF Hadoop sequence file
 * @return dataset with one row per polymer chain
 * @throws IOException if the sequence file cannot be read
 */
private static Dataset<Row> getSequenceData(String[] args) throws IOException {
    SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate();
    JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

    // Split structures into chains, keep the non-redundant subset, and
    // map each chain to a (chainId, sequence) row.
    JavaRDD<Row> rows = MmtfReader.readSequenceFile(args[0], sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(40, 2.0))
            .map(chain -> RowFactory.create(chain._1, chain._2.getEntitySequence(0)));

    // Generate the schema for the two string columns.
    StructField chainIdField = new StructField("structureChainId", DataTypes.StringType, false, Metadata.empty());
    StructField sequenceField = new StructField("sequence", DataTypes.StringType, false, Metadata.empty());
    StructType schema = new StructType(new StructField[] { chainIdField, sequenceField });

    // Apply the schema to the RDD.
    return spark.createDataFrame(rows, schema);
}
Aggregations