Use of edu.sdsc.mmtf.spark.webfilters.Pisces in project mmtf-spark by sbl-sdsc.
The class AtpInteractionAnalysis, method main:
/**
 * @param args no input arguments required
 * @throws IOException if the MMTF Hadoop sequence file cannot be read
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfFullPath();

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]")
            .setAppName(AtpInteractionAnalysis.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);

    // filter by a non-redundant subset (<= 20% sequence identity, <= 2.0 A resolution)
    int sequenceIdentity = 20;
    double resolution = 2.0;
    pdb = pdb.filter(new Pisces(sequenceIdentity, resolution));

    // find ATP interactions within 3 Angstroms
    GroupInteractionExtractor finder = new GroupInteractionExtractor("ATP", 3);
    Dataset<Row> interactions = finder.getDataset(pdb).cache();

    // analyze only interactions with the oxygens in the terminal
    // phosphate group of ATP (O1G, O2G, O3G) using SQL LIKE
    interactions = interactions.filter("atom1 LIKE 'O%G'");

    // show the data schema of the dataset and some data
    interactions.printSchema();
    interactions.show(20);

    long n = interactions.count();
    System.out.println("# interactions: " + n);

    System.out.println("Top interacting groups");
    Dataset<Row> topGroups = interactions.groupBy("residue2").count();
    topGroups.sort(col("count").desc()) // sort descending by count
            .show(10);

    System.out.println("Top interacting group/atom types");
    Dataset<Row> topGroupsAndAtoms = interactions.groupBy("residue2", "atom2").count();
    topGroupsAndAtoms.withColumn("frequency", col("count").divide(n)) // add column with frequency of occurrence
            .sort(col("frequency").desc()) // sort descending
            .show(10);

    long end = System.nanoTime();
    System.out.println("Time: " + (end - start) / 1E9 + " sec.");

    sc.close();
}
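The SQL-style filter above has an equivalent in Spark's typed Column API. The following sketch is an alternative, not part of the original example; it selects the same O1G/O2G/O3G atom names and assumes the interactions dataset defined above plus a static import of org.apache.spark.sql.functions.col:

// alternative to "atom1 LIKE 'O%G'": keep rows whose atom1 name
// starts with "O" and ends with "G" (matches O1G, O2G, O3G)
interactions = interactions.filter(col("atom1").startsWith("O").and(col("atom1").endsWith("G")));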
Use of edu.sdsc.mmtf.spark.webfilters.Pisces in project mmtf-spark by sbl-sdsc.
The class InteractionAnalysisAdvanced, method main:
/**
 * @param args no input arguments required
 * @throws IOException if the MMTF Hadoop sequence file cannot be read
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfFullPath();

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]")
            .setAppName(InteractionAnalysisAdvanced.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);

    // get non-redundant subset (<= 40% sequence identity, <= 2.5 A resolution)
    pdb = pdb.filter(new Pisces(40, 2.5));

    // find zinc interactions within 3 Angstroms
    GroupInteractionExtractor finder = new GroupInteractionExtractor("ZN", 3);
    Dataset<Row> interactions = finder.getDataset(pdb).cache();

    // show the data schema of the dataset and some data
    interactions.printSchema();
    interactions.show(20);

    long n = interactions.count();
    System.out.println("# interactions: " + n);

    System.out.println("Top interacting groups");
    Dataset<Row> topGroups = interactions.groupBy("residue2").count();
    topGroups.sort(col("count").desc()) // sort descending by count
            .show(10);

    System.out.println("Top interacting group/atom types");
    Dataset<Row> topGroupsAndAtoms = interactions
            .filter("element2 != 'C'") // exclude carbon interactions
            .groupBy("residue2", "atom2")
            .count();
    topGroupsAndAtoms.withColumn("frequency", col("count").divide(n)) // add column with frequency of occurrence
            .filter("frequency > 0.01") // filter out occurrences < 1%
            .sort(col("frequency").desc()) // sort descending
            .show(20);

    // print the top 10 interacting elements
    System.out.println("Top interacting elements");
    Dataset<Row> topElements = interactions
            .filter("element2 != 'C'") // exclude carbon interactions
            .groupBy("element2")
            .count();
    topElements.withColumn("frequency", col("count").divide(n))
            .filter("frequency > 0.01") // filter out occurrences < 1%
            .sort(col("frequency").desc()) // sort descending
            .show(10);

    // average interaction distance per element, sorted ascending
    interactions.groupBy("element2").avg("distance").sort("avg(distance)").show(10);

    // aggregate multiple statistics;
    // note: requires import static org.apache.spark.sql.functions.*;
    // see org.apache.spark.sql.functions for the full list of available functions
    interactions.groupBy("element2")
            .agg(count("distance"), avg("distance"), min("distance"), max("distance"), kurtosis("distance"))
            .show(10);

    long end = System.nanoTime();
    System.out.println("Time: " + (end - start) / 1E9 + " sec.");

    sc.close();
}
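As the comment in the snippet notes, the aggregate functions are static imports from org.apache.spark.sql.functions. For reference, these are the imports this example relies on (col is also used by the sort calls):

import static org.apache.spark.sql.functions.avg;
import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.count;
import static org.apache.spark.sql.functions.kurtosis;
import static org.apache.spark.sql.functions.max;
import static org.apache.spark.sql.functions.min;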
Use of edu.sdsc.mmtf.spark.webfilters.Pisces in project mmtf-spark by sbl-sdsc.
The class InteractionAnalysisSimple, method main:
/**
 * @param args no input arguments required
 * @throws IOException if MmtfReader fails
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfFullPath();

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]")
            .setAppName(InteractionAnalysisSimple.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);

    // use only representative structures
    int sequenceIdentity = 40;
    double resolution = 2.5;
    pdb = pdb.filter(new Pisces(sequenceIdentity, resolution));

    // find zinc interactions within 3 Angstroms
    GroupInteractionExtractor finder = new GroupInteractionExtractor("ZN", 3);
    Dataset<Row> interactions = finder.getDataset(pdb).cache();

    interactions.printSchema();
    interactions.show(20);

    System.out.println("# interactions: " + interactions.count());

    // show the top 10 residue types that interact with Zn
    interactions.groupBy(col("residue2")).count().sort(col("count").desc()).show(10);

    long end = System.nanoTime();
    System.out.println("Time: " + (end - start) / 1E9 + " sec.");

    sc.close();
}
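A possible follow-up, not part of the original example: persist the per-residue counts instead of only printing them. The output path below is a hypothetical placeholder.

// hypothetical extension: write the Zn interaction counts to Parquet
String outPath = "zn_interaction_counts.parquet"; // assumed output location
Dataset<Row> counts = interactions.groupBy(col("residue2")).count().sort(col("count").desc());
counts.write().mode("overwrite").parquet(outPath);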
Use of edu.sdsc.mmtf.spark.webfilters.Pisces in project mmtf-spark by sbl-sdsc.
The class MapToBioAssembly2, method main:
public static void main(String[] args) throws FileNotFoundException, IOException {
    SparkConf conf = new SparkConf().setMaster("local[*]")
            .setAppName(MapToBioAssembly2.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    long start = System.nanoTime();

    // alternatively, download selected entries:
    // List<String> pdbIds = Arrays.asList("1HV4");
    // List<String> pdbIds = Arrays.asList("2HHB");
    // JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
    //         .downloadFullMmtfFiles(pdbIds, sc);
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readFullSequenceFile(sc)
            .filter(new Pisces(20, 3.0));

    // System.out.println("**** AU ****");
    // pdb.foreach(t -> TraverseStructureHierarchy.printStructureData(t._2));

    // map each structure to its bioassemblies
    JavaPairRDD<String, StructureDataInterface> bioassemblies = pdb.flatMapToPair(new StructureToBioassembly2());
    System.out.println("Number of bioassemblies: " + bioassemblies.count());

    long end = System.nanoTime();
    System.out.println("Time: " + (end - start) / 1E9 + " sec.");

    // System.out.println("**** BA ****");
    // bioassemblies.foreach(t -> TraverseStructureHierarchy.printStructureData(t._2));
    // bioassemblies.foreach(t -> TraverseStructureHierarchy.printChainEntityGroupAtomInfo(t._2));

    sc.close();
}
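The commented-out lines preserve an alternative input path: downloading individual entries instead of reading the full Hadoop sequence file. A minimal sketch of that variant, using the PDB IDs from the comments (it additionally requires java.util.Arrays and java.util.List):

// download the entries named in the comments instead of reading the archive
List<String> pdbIds = Arrays.asList("1HV4", "2HHB");
JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc);
JavaPairRDD<String, StructureDataInterface> bioassemblies = pdb.flatMapToPair(new StructureToBioassembly2());
System.out.println("Number of bioassemblies: " + bioassemblies.count());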
Use of edu.sdsc.mmtf.spark.webfilters.Pisces in project mmtf-spark by sbl-sdsc.
The class ProteinFoldDatasetCreator, method main:
/**
 * @param args args[0] path to the dataset output file
 * @throws IOException if the MMTF Hadoop sequence file cannot be read
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 1) {
        System.err.println("Usage: " + ProteinFoldDatasetCreator.class.getSimpleName() + " <dataset output file>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]")
            .setAppName(ProteinFoldDatasetCreator.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant Pisces
    // subset (<= 20% sequence identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));

    // get secondary structure content
    Dataset<Row> data = SecondaryStructureExtractor.getDataset(pdb);

    // classify chains by secondary structure type
    double minThreshold = 0.05;
    double maxThreshold = 0.15;
    data = addProteinFoldType(data, minThreshold, maxThreshold);

    // create a binary classification dataset
    data = data.filter("foldType = 'alpha' OR foldType = 'beta'").cache();

    // alternatively, create a three-state classification dataset (alpha, beta, alpha+beta):
    // data = data.filter("foldType != 'other'").cache();

    // add Word2Vec encoded feature vector
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    int n = 2; // n-gram size
    int windowSize = 11; // Word2Vec window size
    int vectorSize = 50; // dimension of the feature vector
    data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize);

    data.printSchema();
    data.show(25);

    // keep only a subset of relevant fields for further processing
    data = data.select("structureChainId", "alpha", "beta", "coil", "foldType", "features");

    data.write().mode("overwrite").format("parquet").save(args[0]);

    long end = System.nanoTime();
    System.out.println((end - start) / 1E9 + " sec");

    sc.close();
}
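Once the Parquet file is written, it can be read back for model training. The sketch below is a hypothetical follow-up (the SparkSession and its settings are assumptions, not part of the original code); it reloads the dataset and checks the class balance of the binary labels:

import org.apache.spark.sql.SparkSession;

// hypothetical follow-up: reload the dataset and inspect the class balance
SparkSession spark = SparkSession.builder().master("local[*]").appName("FoldTypeCheck").getOrCreate();
Dataset<Row> folds = spark.read().parquet(args[0]);
folds.groupBy("foldType").count().show(); // expect counts for 'alpha' and 'beta'
spark.close();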