Example 1 with GroupInteractionExtractor

Use of edu.sdsc.mmtf.spark.datasets.GroupInteractionExtractor in project mmtf-spark by sbl-sdsc.

In the class AtpInteractionAnalysis, method main.

/**
 * @param args no input arguments
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfFullPath();
    long start = System.nanoTime();
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(AtpInteractionAnalysis.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);
    // filter by sequence identity subset
    int sequenceIdentity = 20;
    double resolution = 2.0;
    pdb = pdb.filter(new Pisces(sequenceIdentity, resolution));
    // find ATP interactions within 3 Angstroms
    GroupInteractionExtractor finder = new GroupInteractionExtractor("ATP", 3);
    Dataset<Row> interactions = finder.getDataset(pdb).cache();
    // only analyze interactions with the oxygens in the terminal
    // phosphate group of ATP (O1G, O2G, O3G); within ATP, the SQL LIKE
    // pattern 'O%G' matches exactly those three atom names
    interactions = interactions.filter("atom1 LIKE 'O%G'");
    // show the data schema of the dataset and some data
    interactions.printSchema();
    interactions.show(20);
    long n = interactions.count();
    System.out.println("# interactions: " + n);
    System.out.println("Top interacting groups");
    Dataset<Row> topGroups = interactions.groupBy("residue2").count();
    // sort descending by count
    topGroups.sort(col("count").desc()).show(10);
    System.out.println("Top interacting group/atom types");
    Dataset<Row> topGroupsAndAtoms = interactions.groupBy("residue2", "atom2").count();
    topGroupsAndAtoms
        .withColumn("frequency", col("count").divide(n)) // add a column with the frequency of occurrence
        .sort(col("frequency").desc()) // sort descending
        .show(10);
    long end = System.nanoTime();
    System.out.println("Time:     " + (end - start) / 1E9 + "sec.");
    sc.close();
}
Also used: StructureDataInterface (org.rcsb.mmtf.api.StructureDataInterface), Pisces (edu.sdsc.mmtf.spark.webfilters.Pisces), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), Row (org.apache.spark.sql.Row), SparkConf (org.apache.spark.SparkConf), GroupInteractionExtractor (edu.sdsc.mmtf.spark.datasets.GroupInteractionExtractor)
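
The terminal-phosphate filter above uses a SQL expression string. The same condition can be written with Spark's typed Column API; the following is a minimal sketch that only assumes the atom1 column shown in the example (col is the same static import from org.apache.spark.sql.functions already used for sorting):

    // equivalent filter using the Column API instead of a SQL string:
    // keep atoms whose names start with "O" and end with "G" (O1G, O2G, O3G)
    interactions = interactions.filter(col("atom1").startsWith("O").and(col("atom1").endsWith("G")));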

Example 2 with GroupInteractionExtractor

Use of edu.sdsc.mmtf.spark.datasets.GroupInteractionExtractor in project mmtf-spark by sbl-sdsc.

In the class InteractionAnalysisAdvanced, method main.

/**
 * @param args no input arguments
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfFullPath();
    long start = System.nanoTime();
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(InteractionAnalysisAdvanced.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);
    // get non-redundant subset
    pdb = pdb.filter(new Pisces(40, 2.5));
    // find Zinc interactions within 3 Angstroms
    GroupInteractionExtractor finder = new GroupInteractionExtractor("ZN", 3);
    Dataset<Row> interactions = finder.getDataset(pdb).cache();
    // show the data schema of the dataset and some data
    interactions.printSchema();
    interactions.show(20);
    long n = interactions.count();
    System.out.println("# interactions: " + n);
    System.out.println("Top interacting groups");
    Dataset<Row> topGroups = interactions.groupBy("residue2").count();
    // sort descending by count
    topGroups.sort(col("count").desc()).show(10);
    System.out.println("Top interacting group/atom types");
    // exclude carbon interactions
    Dataset<Row> topGroupsAndAtoms = interactions.filter("element2 != 'C'").groupBy("residue2", "atom2").count();
    topGroupsAndAtoms
        .withColumn("frequency", col("count").divide(n)) // add a column with the frequency of occurrence
        .filter("frequency > 0.01") // filter out occurrences below 1%
        .sort(col("frequency").desc()) // sort descending
        .show(20);
    // print the top 10 interacting elements
    System.out.println("Top interacting elements");
    // exclude carbon interactions
    Dataset<Row> topElements = interactions.filter("element2 != 'C'").groupBy("element2").count();
    topElements
        .withColumn("frequency", col("count").divide(n)) // add a column with the frequency of occurrence
        .filter("frequency > 0.01") // filter out occurrences below 1%
        .sort(col("frequency").desc()) // sort descending
        .show(10);
    interactions.groupBy("element2").avg("distance").sort("avg(distance)").show(10);
    // Aggregate multiple statistics
    // Note: import static org.apache.spark.sql.functions.* required!
    // e.g. org.apache.spark.sql.functions.avg
    // for a list of all available functions
    interactions.groupBy("element2").agg(count("distance"), avg("distance"), min("distance"), max("distance"), kurtosis("distance")).show(10);
    long end = System.nanoTime();
    System.out.println("Time:     " + (end - start) / 1E9 + "sec.");
    sc.close();
}
Also used: Pisces (edu.sdsc.mmtf.spark.webfilters.Pisces), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), StructureDataInterface (org.rcsb.mmtf.api.StructureDataInterface), Row (org.apache.spark.sql.Row), SparkConf (org.apache.spark.SparkConf), GroupInteractionExtractor (edu.sdsc.mmtf.spark.datasets.GroupInteractionExtractor)
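
The note in the example about static imports refers to the aggregate helpers in org.apache.spark.sql.functions. A sketch of the imports this method body relies on (not necessarily the exact header of the original file) would be:

import static org.apache.spark.sql.functions.avg;
import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.count;
import static org.apache.spark.sql.functions.kurtosis;
import static org.apache.spark.sql.functions.max;
import static org.apache.spark.sql.functions.min;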

Example 3 with GroupInteractionExtractor

Use of edu.sdsc.mmtf.spark.datasets.GroupInteractionExtractor in project mmtf-spark by sbl-sdsc.

In the class InteractionAnalysisSimple, method main.

/**
 * @param args no input arguments
 * @throws IOException if MmtfReader fails
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfFullPath();
    long start = System.nanoTime();
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(InteractionAnalysisSimple.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);
    // use only representative structures
    int sequenceIdentity = 40;
    double resolution = 2.5;
    pdb = pdb.filter(new Pisces(sequenceIdentity, resolution));
    GroupInteractionExtractor finder = new GroupInteractionExtractor("ZN", 3);
    Dataset<Row> interactions = finder.getDataset(pdb).cache();
    // show the data schema of the dataset and some data
    interactions.printSchema();
    interactions.show(20);
    System.out.println("# interactions: " + interactions.count());
    // show the top 10 interacting groups
    interactions.groupBy(col("residue2")).count().sort(col("count").desc()).show(10);
    long end = System.nanoTime();
    System.out.println("Time:     " + (end - start) / 1E9 + "sec.");
    sc.close();
}
Also used: StructureDataInterface (org.rcsb.mmtf.api.StructureDataInterface), Pisces (edu.sdsc.mmtf.spark.webfilters.Pisces), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), Row (org.apache.spark.sql.Row), SparkConf (org.apache.spark.SparkConf), GroupInteractionExtractor (edu.sdsc.mmtf.spark.datasets.GroupInteractionExtractor)
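
This example only prints the top interacting groups. If the result is needed in the driver program for further processing, the same query can be collected with the standard Dataset API; a small sketch, using only methods already shown plus Dataset.takeAsList (requires java.util.List in addition to the imports listed above):

    // collect the ten most frequent interacting groups into the driver
    List<Row> topGroups = interactions.groupBy(col("residue2")).count().sort(col("count").desc()).takeAsList(10);
    for (Row row : topGroups) {
        System.out.println(row.getString(0) + ": " + row.getLong(1));
    }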

Aggregations

GroupInteractionExtractor (edu.sdsc.mmtf.spark.datasets.GroupInteractionExtractor): 3 usages
Pisces (edu.sdsc.mmtf.spark.webfilters.Pisces): 3 usages
SparkConf (org.apache.spark.SparkConf): 3 usages
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 3 usages
Row (org.apache.spark.sql.Row): 3 usages
StructureDataInterface (org.rcsb.mmtf.api.StructureDataInterface): 3 usages