Search in sources :

Example 1 with Pisces

use of edu.sdsc.mmtf.spark.webfilters.Pisces in project mmtf-spark by sbl-sdsc.

the class AtpInteractionAnalysis method main.

/**
 * Finds interactions with ATP in a Pisces-filtered subset of the PDB and
 * prints summary statistics to standard output.
 *
 * <p>The structures are read from the MMTF sequence file returned by
 * {@code MmtfReader.getMmtfFullPath()}, filtered with
 * {@code Pisces(20, 2.0)}, and passed to a {@code GroupInteractionExtractor}
 * for "ATP" with a cutoff of 3. Only rows whose {@code atom1} matches the SQL
 * pattern {@code O%G} are analyzed.
 *
 * @param args input arguments (not used)
 * @throws IOException propagated from the called library code
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfFullPath();
    long start = System.nanoTime();
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(AtpInteractionAnalysis.class.getSimpleName());
    // try-with-resources guarantees the Spark context is closed even when
    // one of the Spark actions below throws
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // read PDB in MMTF format
        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);
        // filter by sequence identity subset
        int sequenceIdentity = 20;
        double resolution = 2.0;
        pdb = pdb.filter(new Pisces(sequenceIdentity, resolution));
        // find ATP interactions within 3 Angstroms
        GroupInteractionExtractor finder = new GroupInteractionExtractor("ATP", 3);
        Dataset<Row> interactions = finder.getDataset(pdb).cache();
        // only analyze interactions with the oxygens in the terminal
        // phosphate group of ATP (O1G, O2G, O3G); SQL LIKE with '%' as
        // wildcard selects atom names that start with 'O' and end with 'G'
        interactions = interactions.filter("atom1 LIKE('O%G')");
        // show the data schema of the dataset and some data
        interactions.printSchema();
        interactions.show(20);
        long n = interactions.count();
        System.out.println("# interactions: " + n);
        System.out.println("Top interacting groups");
        Dataset<Row> topGroups = interactions.groupBy("residue2").count();
        // sort descending by count
        topGroups.sort(col("count").desc()).show(10);
        System.out.println("Top interacting group/atoms types");
        Dataset<Row> topGroupsAndAtoms = interactions.groupBy("residue2", "atom2").count();
        topGroupsAndAtoms
                // add column with frequency of occurrence (count / total rows)
                .withColumn("frequency", col("count").divide(n))
                // sort descending
                .sort(col("frequency").desc())
                .show(10);
        long end = System.nanoTime();
        System.out.println("Time:     " + (end - start) / 1E9 + "sec.");
    }
}
Also used : StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) Pisces(edu.sdsc.mmtf.spark.webfilters.Pisces) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Row(org.apache.spark.sql.Row) SparkConf(org.apache.spark.SparkConf) GroupInteractionExtractor(edu.sdsc.mmtf.spark.datasets.GroupInteractionExtractor)

Example 2 with Pisces

use of edu.sdsc.mmtf.spark.webfilters.Pisces in project mmtf-spark by sbl-sdsc.

the class InteractionAnalysisAdvanced method main.

/**
 * Finds interactions with zinc ("ZN") in a Pisces-filtered subset of the PDB
 * and prints several aggregate views of them to standard output.
 *
 * <p>Printed, in order: the dataset schema and 20 sample rows, the total
 * number of interactions, the top interacting groups, the top group/atom
 * combinations and the top elements (both excluding carbon and limited to
 * frequencies above 0.01), the average distance per element, and a table of
 * count/avg/min/max/kurtosis of the distance per element.
 *
 * @param args no input arguments
 * @throws IOException propagated from the called library code
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfFullPath();
    long start = System.nanoTime();
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(InteractionAnalysisAdvanced.class.getSimpleName());
    // try-with-resources guarantees the Spark context is closed even when
    // one of the Spark actions below throws
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // read PDB in MMTF format
        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);
        // get non-redundant subset
        pdb = pdb.filter(new Pisces(40, 2.5));
        // find Zinc interactions within 3 Angstroms
        GroupInteractionExtractor finder = new GroupInteractionExtractor("ZN", 3);
        Dataset<Row> interactions = finder.getDataset(pdb).cache();
        // show the data schema of the dataset and some data
        interactions.printSchema();
        interactions.show(20);
        // n is the total number of interactions (carbon included); all
        // frequencies below are relative to this total
        long n = interactions.count();
        System.out.println("# interactions: " + n);
        System.out.println("Top interacting groups");
        Dataset<Row> topGroups = interactions.groupBy("residue2").count();
        // sort descending by count
        topGroups.sort(col("count").desc()).show(10);
        System.out.println("Top interacting group/atoms types");
        Dataset<Row> topGroupsAndAtoms = interactions
                // exclude carbon interactions
                .filter("element2 != 'C'")
                .groupBy("residue2", "atom2")
                .count();
        topGroupsAndAtoms
                // add column with frequency of occurrence
                .withColumn("frequency", col("count").divide(n))
                // filter out occurrences < 1 %
                .filter("frequency > 0.01")
                // sort descending
                .sort(col("frequency").desc())
                .show(20);
        // print the top 10 interacting elements
        System.out.println("Top interacting elements");
        Dataset<Row> topElements = interactions
                // exclude carbon interactions
                .filter("element2 != 'C'")
                .groupBy("element2")
                .count();
        topElements
                .withColumn("frequency", col("count").divide(n))
                // filter out occurrences < 1 %
                .filter("frequency > 0.01")
                // sort descending
                .sort(col("frequency").desc())
                .show(10);
        // average distance per element, sorted by that average
        interactions.groupBy("element2").avg("distance").sort("avg(distance)").show(10);
        // Aggregate multiple statistics
        // Note: import static org.apache.spark.sql.functions.* required!
        // e.g. org.apache.spark.sql.functions.avg
        // for a list of all available functions
        interactions.groupBy("element2").agg(count("distance"), avg("distance"), min("distance"), max("distance"), kurtosis("distance")).show(10);
        long end = System.nanoTime();
        System.out.println("Time:     " + (end - start) / 1E9 + "sec.");
    }
}
Also used : Pisces(edu.sdsc.mmtf.spark.webfilters.Pisces) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) Row(org.apache.spark.sql.Row) SparkConf(org.apache.spark.SparkConf) GroupInteractionExtractor(edu.sdsc.mmtf.spark.datasets.GroupInteractionExtractor)

Example 3 with Pisces

use of edu.sdsc.mmtf.spark.webfilters.Pisces in project mmtf-spark by sbl-sdsc.

the class InteractionAnalysisSimple method main.

/**
 * Finds interactions with zinc ("ZN") in a Pisces-filtered subset of the PDB
 * and prints the schema, 20 sample rows, the number of interactions, and the
 * 10 most frequent values of {@code residue2} to standard output.
 *
 * @param args no input arguments
 * @throws IOException if MmtfReader fails
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfFullPath();
    long start = System.nanoTime();
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(InteractionAnalysisSimple.class.getSimpleName());
    // try-with-resources guarantees the Spark context is closed even when
    // one of the Spark actions below throws
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // read PDB in MMTF format
        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);
        // use only representative structures
        int sequenceIdentity = 40;
        double resolution = 2.5;
        pdb = pdb.filter(new Pisces(sequenceIdentity, resolution));
        // extract interactions with "ZN" using a cutoff of 3; the result is
        // cached because it is used by several actions below
        GroupInteractionExtractor finder = new GroupInteractionExtractor("ZN", 3);
        Dataset<Row> interactions = finder.getDataset(pdb).cache();
        // show the data schema of the dataset and some data
        interactions.printSchema();
        interactions.show(20);
        System.out.println("# interactions: " + interactions.count());
        // show the top 10 interacting groups (residue types that interact
        // with Zn), sorted descending by count
        interactions.groupBy(col("residue2")).count().sort(col("count").desc()).show(10);
        long end = System.nanoTime();
        System.out.println("Time:     " + (end - start) / 1E9 + "sec.");
    }
}
Also used : StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) Pisces(edu.sdsc.mmtf.spark.webfilters.Pisces) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Row(org.apache.spark.sql.Row) SparkConf(org.apache.spark.SparkConf) GroupInteractionExtractor(edu.sdsc.mmtf.spark.datasets.GroupInteractionExtractor)

Example 4 with Pisces

use of edu.sdsc.mmtf.spark.webfilters.Pisces in project mmtf-spark by sbl-sdsc.

the class MapToBioAssembly2 method main.

/**
 * Maps the structures of a Pisces-filtered subset of the PDB to
 * bioassemblies using {@code StructureToBioassembly2} and prints the number
 * of bioassemblies and the elapsed time to standard output.
 *
 * @param args input arguments (not used)
 * @throws FileNotFoundException propagated from the called library code
 * @throws IOException propagated from the called library code
 */
public static void main(String[] args) throws FileNotFoundException, IOException {
    // The application name previously came from CustomReportDemo (copy-paste
    // slip); it now names this class so the job is labeled correctly.
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("MapToBioAssembly2");
    // try-with-resources guarantees the Spark context is closed even when
    // reading the input or counting the bioassemblies throws
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        long start = System.nanoTime();
        // List<String> pdbIds = Arrays.asList("1HV4");
        // List<String> pdbIds = Arrays.asList("2HHB");
        // JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
        // .downloadFullMmtfFiles(pdbIds, sc);
        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readFullSequenceFile(sc).filter(new Pisces(20, 3.0));
        // System.out.println("**** AU ****");
        // pdb.foreach(t -> TraverseStructureHierarchy.printStructureData(t._2));
        // one input structure may yield several bioassemblies (flatMap)
        JavaPairRDD<String, StructureDataInterface> bioassemblies = pdb.flatMapToPair(new StructureToBioassembly2());
        System.out.println("Number of bioassemblies: " + bioassemblies.count());
        long end = System.nanoTime();
        System.out.println("time: " + (end - start) / 1E9 + " sec.");
        // System.out.println("**** BA ****");
        // bioassemblies.foreach(t -> TraverseStructureHierarchy.printStructureData(t._2));
        // bioassemblies.foreach(t -> TraverseStructureHierarchy.printChainEntityGroupAtomInfo(t._2));
    }
}
Also used : Pisces(edu.sdsc.mmtf.spark.webfilters.Pisces) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) StructureToBioassembly2(edu.sdsc.mmtf.spark.mappers.StructureToBioassembly2) CustomReportDemo(edu.sdsc.mmtf.spark.datasets.demos.CustomReportDemo) SparkConf(org.apache.spark.SparkConf)

Example 5 with Pisces

use of edu.sdsc.mmtf.spark.webfilters.Pisces in project mmtf-spark by sbl-sdsc.

the class ProteinFoldDatasetCreator method main.

/**
 * Creates a protein fold classification dataset from a Pisces-filtered set
 * of polymer chains and writes it in Parquet format, overwriting any
 * existing output.
 *
 * <p>Each chain is assigned a fold type by {@code addProteinFoldType}; only
 * rows with fold type 'alpha' or 'beta' are kept. A feature vector is added
 * with {@code ProteinSequenceEncoder.overlappingNgramWord2VecEncode}.
 *
 * @param args exactly one argument: the dataset output file; with any other
 *             number of arguments a usage message is printed to standard
 *             error and the JVM exits with status 1
 * @throws IOException propagated from the called library code
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();
    if (args.length != 1) {
        System.err.println("Usage: " + ProteinFoldDatasetCreator.class.getSimpleName() + " <dataset output file>");
        System.exit(1);
    }
    long start = System.nanoTime();
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(ProteinFoldDatasetCreator.class.getSimpleName());
    // try-with-resources closes the Spark context on success and on failure;
    // previously the context was never closed
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // read MMTF Hadoop sequence file and create a non-redundant Pisces
        // subset set (<=20% seq. identity) of L-protein chains
        int sequenceIdentity = 20;
        double resolution = 3.0;
        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc).flatMapToPair(new StructureToPolymerChains()).filter(new Pisces(sequenceIdentity, resolution));
        // get secondary structure content
        Dataset<Row> data = SecondaryStructureExtractor.getDataset(pdb);
        // classify chains by secondary structure type
        double minThreshold = 0.05;
        double maxThreshold = 0.15;
        data = addProteinFoldType(data, minThreshold, maxThreshold);
        // create a binary classification dataset
        data = data.filter("foldType = 'alpha' OR foldType = 'beta'").cache();
        // create a three-state classification model (alpha, beta, alpha+beta)
        // data = data.filter("foldType != 'other'").cache();
        // add Word2Vec encoded feature vector
        ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
        int n = 2;
        int windowSize = 11;
        int vectorSize = 50;
        data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize);
        data.printSchema();
        data.show(25);
        // keep only a subset of relevant fields for further processing
        data = data.select("structureChainId", "alpha", "beta", "coil", "foldType", "features");
        data.write().mode("overwrite").format("parquet").save(args[0]);
        long end = System.nanoTime();
        System.out.println((end - start) / 1E9 + " sec");
    }
}
Also used : ProteinSequenceEncoder(edu.sdsc.mmtf.spark.ml.ProteinSequenceEncoder) StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) Pisces(edu.sdsc.mmtf.spark.webfilters.Pisces) StructureToPolymerChains(edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Row(org.apache.spark.sql.Row) SparkConf(org.apache.spark.SparkConf)

Aggregations

Pisces (edu.sdsc.mmtf.spark.webfilters.Pisces): 20, JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 20, SparkConf (org.apache.spark.SparkConf): 19, StructureDataInterface (org.rcsb.mmtf.api.StructureDataInterface): 19, Row (org.apache.spark.sql.Row): 18, StructureToPolymerChains (edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains): 15, ProteinSequenceEncoder (edu.sdsc.mmtf.spark.ml.ProteinSequenceEncoder): 10, GroupInteractionExtractor (edu.sdsc.mmtf.spark.datasets.GroupInteractionExtractor): 3, CustomReportDemo (edu.sdsc.mmtf.spark.datasets.demos.CustomReportDemo): 1, InteractionFilter (edu.sdsc.mmtf.spark.interactions.InteractionFilter): 1, StructureToBioJava (edu.sdsc.mmtf.spark.mappers.StructureToBioJava): 1, StructureToBioassembly2 (edu.sdsc.mmtf.spark.mappers.StructureToBioassembly2): 1, SparkSession (org.apache.spark.sql.SparkSession): 1, StructField (org.apache.spark.sql.types.StructField): 1, StructType (org.apache.spark.sql.types.StructType): 1, Structure (org.biojava.nbio.structure.Structure): 1