Search in sources :

Example 31 with StructureDataInterface

use of org.rcsb.mmtf.api.StructureDataInterface in project mmtf-spark by sbl-sdsc.

the class StructureToBioassembly2 method call.

/**
 * Expands one (id, structure) pair into one pair per bioassembly of the structure.
 *
 * For every bioassembly index reported by {@code getNumBioassemblies()} a new
 * {@code AdapterToStructureData} is created, sized in a first counting pass, and then
 * filled in a second pass by copying the selected chains once per transformation, with
 * the transformation matrix applied to each atom's coordinates.
 *
 * @param t tuple whose second element ({@code t._2}) is the source structure; the first
 *          element is not read by this method
 * @return iterator over (structureId + "-BioAssembly" + bioassembly name, bioassembly) tuples,
 *         one per bioassembly; empty when the structure reports zero bioassemblies
 * @throws Exception declared by the signature; no exception is thrown explicitly in this body
 */
@Override
public Iterator<Tuple2<String, StructureDataInterface>> call(Tuple2<String, StructureDataInterface> t) throws Exception {
    StructureDataInterface structure = t._2;
    // Map<Integer, Integer> atomMap = new HashMap<>();
    List<Tuple2<String, StructureDataInterface>> resList = new ArrayList<>();
    // for each of them, create one structure.
    for (int i = 0; i < structure.getNumBioassemblies(); i++) {
        // number of chains the bioassembly will contain in each model (summed over all transformations)
        int[] chainsPerModel = new int[structure.getNumModels()];
        // initiate the bioassembly structure.
        AdapterToStructureData bioAssembly = new AdapterToStructureData();
        // set the structureID.
        String structureId = structure.getStructureId() + "-BioAssembly" + structure.getBioassemblyName(i);
        int totAtoms = 0, totBonds = 0, totGroups = 0, totChains = 0, totModels = 0;
        int numTrans = structure.getNumTransInBioassembly(i);
        totModels = structure.getNumModels();
        // per transformation: the chain indices it applies to and its matrix
        int[][] bioChainList = new int[numTrans][];
        double[][] transMatrix = new double[numTrans][];
        // calculate the total data we will use to initialize the structure.
        // (first pass: only counts chains/groups/atoms/bonds, nothing is copied yet)
        for (int ii = 0; ii < numTrans; ii++) {
            bioChainList[ii] = structure.getChainIndexListForTransform(i, ii);
            transMatrix[ii] = structure.getMatrixForTransform(i, ii);
            for (int j = 0; j < totModels; j++) {
                totChains += bioChainList[ii].length;
                chainsPerModel[j] += bioChainList[ii].length;
                // System.out.println(bioChainList[ii].length + " "  + Arrays.toString(bioChainList[ii]));
                // NOTE(review): k and groupCounter restart at 0 for every model j, yet they index
                // getGroupsPerChain() / getGroupTypeIndices() directly. If those arrays span all
                // models, models after the first are counted using the first model's entries -- confirm.
                for (int k = 0, groupCounter = 0; k < structure.getChainsPerModel()[j]; k++) {
                    // is chain k part of the current transformation's chain list?
                    boolean adding = false;
                    for (int currChain : bioChainList[ii]) {
                        if (currChain == k)
                            adding = true;
                    }
                    if (adding) {
                        // System.out.println("adding groups");
                        totGroups += structure.getGroupsPerChain()[k];
                    }
                    // groupCounter advances for every group, selected or not, so that it stays
                    // aligned with the position in getGroupTypeIndices()
                    for (int h = 0; h < structure.getGroupsPerChain()[k]; h++, groupCounter++) {
                        if (adding) {
                            int groupIndex = structure.getGroupTypeIndices()[groupCounter];
                            totAtoms += structure.getNumAtomsInGroup(groupIndex);
                            totBonds += structure.getGroupBondOrders(groupIndex).length;
                        }
                    }
                }
            }
        }
        // init
        // System.out.println("Initializing the structure with\n"
        // + " totModel = " + totModels + ", totChains = " + totChains + ", totGroups = " + totGroups + ", totAtoms = "
        // + totAtoms + ", totBonds = " + totBonds + ", name : " + structureId);
        bioAssembly.initStructure(totBonds, totAtoms, totGroups, totChains, totModels, structureId);
        DecoderUtils.addXtalographicInfo(structure, bioAssembly);
        DecoderUtils.addHeaderInfo(structure, bioAssembly);
        /*
         * Now we have bioChainList and transMatrix.
         * bioChainList[i] is the ith trans' list of chains it has.
         * transMatrix[i] is the matrix that is going to be applied on those chains.
         */
        // initialize the indices.
        // chainIndex, groupIndex and atomIndex run over the source structure and are never reset
        // between models; chainCounter counts the chains written to the bioassembly so far.
        int modelIndex = 0;
        int chainIndex = 0;
        int groupIndex = 0;
        int atomIndex = 0;
        int chainCounter = 0;
        // loop through models
        for (int ii = 0; ii < structure.getNumModels(); ii++) {
            // precalculate indices
            // this number is not correct if BA has fewer chains than the AU
            // int numChainsPerModel = structure.getChainsPerModel()[modelIndex] * numTrans;
            int numChainsPerModel = chainsPerModel[ii];
            // NOTE(review): this prints to stdout once per model per bioassembly; looks like
            // leftover debug output -- confirm whether it is still wanted.
            System.out.println("numChainsPerModel: " + numChainsPerModel);
            bioAssembly.setModelInfo(modelIndex, numChainsPerModel);
            // getChainToEntityIndex is defined outside this block; the result is indexed by chainIndex below
            int[] chainToEntityIndex = getChainToEntityIndex(structure);
            // loop through chains
            for (int j = 0; j < structure.getChainsPerModel()[modelIndex]; j++) {
                // loop through each trans
                // remember where this chain starts so every transformation re-reads the same
                // source groups and atoms
                int currGroupIndex = groupIndex;
                int currAtomIndex = atomIndex;
                for (int k = 0; k < numTrans; k++) {
                    // get the currChainList that needs to be added
                    int[] currChainList = bioChainList[k];
                    double[] currMatrix = transMatrix[k];
                    // NOTE(review): the chain list is compared against j (index within the model)
                    // while the data arrays below are indexed with chainIndex (running index
                    // across models) -- confirm this is intended for multi-model structures.
                    boolean addThisChain = false;
                    for (int currChain : currChainList) {
                        if (currChain == j)
                            addThisChain = true;
                    }
                    // rewind to the start of the current chain for this transformation
                    groupIndex = currGroupIndex;
                    atomIndex = currAtomIndex;
                    float[] xCoords = structure.getxCoords();
                    float[] yCoords = structure.getyCoords();
                    float[] zCoords = structure.getzCoords();
                    // float[] floatMatrix = Floats.toArray(Doubles.asList(currMatrix));
                    // Matrix4f m = new Matrix4f(floatMatrix);
                    Matrix4d md = new Matrix4d(currMatrix);
                    if (addThisChain) {
                        int entityToChainIndex = chainToEntityIndex[chainIndex];
                        // System.out.println("adding chain : " + chainIndex);
                        // TODO
                        // not sure
                        bioAssembly.setEntityInfo(new int[] { chainCounter }, structure.getEntitySequence(entityToChainIndex), structure.getEntityDescription(entityToChainIndex), structure.getEntityType(entityToChainIndex));
                        // TODO create unique chain ids
                        // chain id and chain name get the 1-based transformation number appended
                        bioAssembly.setChainInfo(structure.getChainIds()[chainIndex] + (k + 1), structure.getChainNames()[chainIndex] + (k + 1), // bioAssembly.setChainInfo(structure.getChainIds()[chainIndex], structure.getChainNames()[chainIndex],
                        structure.getGroupsPerChain()[chainIndex]);
                        chainCounter++;
                    }
                    // loop through the groups in the chain
                    // (the loops always run so groupIndex/atomIndex advance past the chain even
                    // when it is not part of this transformation)
                    for (int jj = 0; jj < structure.getGroupsPerChain()[chainIndex]; jj++) {
                        int currgroup = structure.getGroupTypeIndices()[groupIndex];
                        if (addThisChain) {
                            bioAssembly.setGroupInfo(structure.getGroupName(currgroup), structure.getGroupIds()[groupIndex], structure.getInsCodes()[groupIndex], structure.getGroupChemCompType(currgroup), structure.getNumAtomsInGroup(currgroup), structure.getGroupBondOrders(currgroup).length, structure.getGroupSingleLetterCode(currgroup), structure.getGroupSequenceIndices()[groupIndex], structure.getSecStructList()[groupIndex]);
                        }
                        for (int kk = 0; kk < structure.getNumAtomsInGroup(currgroup); kk++) {
                            // System.out.println("currgroup : " + currgroup + " curratom : " + kk);
                            if (addThisChain) {
                                // apply the transformation matrix to the atom position before storing it
                                Point3f p1 = new Point3f(xCoords[atomIndex], yCoords[atomIndex], zCoords[atomIndex]);
                                // m.transform(p1);
                                md.transform(p1);
                                // System.out.println(kk + " " + currgroup);
                                bioAssembly.setAtomInfo(structure.getGroupAtomNames(currgroup)[kk], structure.getAtomIds()[atomIndex], structure.getAltLocIds()[atomIndex], p1.x, p1.y, p1.z, structure.getOccupancies()[atomIndex], structure.getbFactors()[atomIndex], structure.getGroupElementNames(currgroup)[kk], structure.getGroupAtomCharges(currgroup)[kk]);
                            }
                            // inc the atomIndex
                            atomIndex++;
                        }
                        if (addThisChain) {
                            // bond l of the group uses entries 2*l and 2*l+1 of the group's bond index array
                            for (int l = 0; l < structure.getGroupBondOrders(currgroup).length; l++) {
                                // System.out.println(structure.getGroupBondOrders(currgroup).length + " " + l);
                                int bondIndOne = structure.getGroupBondIndices(currgroup)[l * 2];
                                int bondIndTwo = structure.getGroupBondIndices(currgroup)[l * 2 + 1];
                                int bondOrder = structure.getGroupBondOrders(currgroup)[l];
                                bioAssembly.setGroupBond(bondIndOne, bondIndTwo, bondOrder);
                            }
                        }
                        // inc the groupIndex
                        groupIndex++;
                    }
                    // This branch is currently empty: the inter-group bond copy below is commented
                    // out, so inter-group bonds are not written to the bioassembly here.
                    if (addThisChain) {
                    // Add inter-group bond info
                    // for(int l = 0;  l < structure.getInterGroupBondOrders().length; l++){
                    // int bondIndOne = structure.getInterGroupBondIndices()[l*2];
                    // int bondIndTwo = structure.getInterGroupBondIndices()[l*2+1];
                    // int bondOrder = structure.getInterGroupBondOrders()[l];
                    // Integer indexOne = atomMap.get(bondIndOne);
                    // if (indexOne != null) {
                    // Integer indexTwo = atomMap.get(bondIndTwo);
                    // if (indexTwo != null) {
                    // bioAssembly.setInterGroupBond(indexOne, indexTwo, bondOrder);
                    // }
                    // }
                    }
                }
                // inc the chainIndex
                chainIndex++;
            }
            // inc the modelIndex
            modelIndex++;
        }
        bioAssembly.finalizeStructure();
        resList.add(new Tuple2<String, StructureDataInterface>(structureId, bioAssembly));
    }
    return resList.iterator();
}
Also used : ArrayList(java.util.ArrayList) StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) Matrix4d(javax.vecmath.Matrix4d) AdapterToStructureData(org.rcsb.mmtf.encoder.AdapterToStructureData) Point3f(javax.vecmath.Point3f) Tuple2(scala.Tuple2)

Example 32 with StructureDataInterface

use of org.rcsb.mmtf.api.StructureDataInterface in project mmtf-spark by sbl-sdsc.

the class ProteinFoldDatasetCreator method main.

/**
 * Builds a protein fold classification dataset and saves it in parquet format.
 *
 * The chains are read from the MMTF sequence file returned by
 * {@code MmtfReader.getMmtfReducedPath()}, reduced to a Pisces subset, classified by
 * secondary structure content, restricted to the 'alpha' and 'beta' fold types and
 * encoded with an overlapping n-gram Word2Vec encoder before being written.
 *
 * @param args args[0] path of the dataset output file; exactly one argument is required,
 *             otherwise a usage message is printed and the JVM exits with status 1
 * @throws IOException declared by the signature; propagated from the called reader/encoder
 *                     methods (which callee raises it is not visible in this method)
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();
    if (args.length != 1) {
        // the original message was missing the closing '>' of the placeholder
        System.err.println("Usage: " + ProteinFoldDatasetCreator.class.getSimpleName() + " <dataset output file>");
        System.exit(1);
    }
    long start = System.nanoTime();
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(ProteinFoldDatasetCreator.class.getSimpleName());
    // try-with-resources shuts the Spark context down even when a step below fails
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // read MMTF Hadoop sequence file and create a non-redundant Pisces
        // subset set (<=20% seq. identity) of L-protein chains
        int sequenceIdentity = 20;
        double resolution = 3.0;
        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc).flatMapToPair(new StructureToPolymerChains()).filter(new Pisces(sequenceIdentity, resolution));
        // get secondary structure content
        Dataset<Row> data = SecondaryStructureExtractor.getDataset(pdb);
        // classify chains by secondary structure type
        double minThreshold = 0.05;
        double maxThreshold = 0.15;
        data = addProteinFoldType(data, minThreshold, maxThreshold);
        // create a binary classification dataset
        data = data.filter("foldType = 'alpha' OR foldType = 'beta'").cache();
        // create a three-state classification model (alpha, beta, alpha+beta)
        // data = data.filter("foldType != 'other'").cache();
        // add Word2Vec encoded feature vector
        ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
        int n = 2;
        int windowSize = 11;
        int vectorSize = 50;
        data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize);
        data.printSchema();
        data.show(25);
        // keep only a subset of relevant fields for further processing
        data = data.select("structureChainId", "alpha", "beta", "coil", "foldType", "features");
        data.write().mode("overwrite").format("parquet").save(args[0]);
        // report the elapsed time before the context is shut down, as before
        long end = System.nanoTime();
        System.out.println((end - start) / 1E9 + " sec");
    }
}
Also used : ProteinSequenceEncoder(edu.sdsc.mmtf.spark.ml.ProteinSequenceEncoder) StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) Pisces(edu.sdsc.mmtf.spark.webfilters.Pisces) StructureToPolymerChains(edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Row(org.apache.spark.sql.Row) SparkConf(org.apache.spark.SparkConf)

Example 33 with StructureDataInterface

use of org.rcsb.mmtf.api.StructureDataInterface in project mmtf-spark by sbl-sdsc.

the class SecondaryStructureElementsWord2VecEncoder method main.

/**
 * Extracts helical sequence segments from a Pisces subset of protein chains, encodes them
 * with an overlapping n-gram Word2Vec encoder, prints samples, and optionally saves the result.
 *
 * @param args either no arguments (results are only printed), or
 *             args[0] outputFilePath and args[1] fileFormat passed to the dataset writer;
 *             any other argument count prints a usage message and exits with status 1
 * @throws IOException declared by the signature; propagated from the called reader/encoder
 *                     methods (which callee raises it is not visible in this method)
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();
    if (args.length != 0 && args.length != 2) {
        System.err.println("Usage: " + SecondaryStructureElementsWord2VecEncoder.class.getSimpleName() + " [<outputFilePath> + <fileFormat>]");
        System.exit(1);
    }
    long start = System.nanoTime();
    // name the Spark application after this class (the original used the name of a different encoder class)
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(SecondaryStructureElementsWord2VecEncoder.class.getSimpleName());
    // try-with-resources shuts the Spark context down even when a step below fails
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // read MMTF Hadoop sequence file and create a non-redundant Pisces
        // subset set (<=20% seq. identity) of L-protein chains
        int sequenceIdentity = 20;
        double resolution = 3.0;
        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc).flatMapToPair(new StructureToPolymerChains()).filter(new Pisces(sequenceIdentity, resolution));
        int segmentLength = 11;
        // extract helical sequence segments
        Dataset<Row> data = SecondaryStructureElementExtractor.getDataset(pdb, "H", segmentLength);
        System.out.println(data.count());
        data.show(10, false);
        // add Word2Vec encoded feature vector
        ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
        int n = 2;
        int windowSize = (segmentLength - 1) / 2;
        int vectorSize = 50;
        data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize);
        data.show(50, false);
        // optionally, save results
        if (args.length > 0) {
            if (args[1].equals("json")) {
                // coalesce data into a single file
                data = data.coalesce(1);
            }
            data.write().mode("overwrite").format(args[1]).save(args[0]);
        }
        // report the elapsed time before the context is shut down, as before
        long end = System.nanoTime();
        System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
    }
}
Also used : ProteinSequenceEncoder(edu.sdsc.mmtf.spark.ml.ProteinSequenceEncoder) StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) Pisces(edu.sdsc.mmtf.spark.webfilters.Pisces) StructureToPolymerChains(edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Row(org.apache.spark.sql.Row) SparkConf(org.apache.spark.SparkConf)

Example 34 with StructureDataInterface

use of org.rcsb.mmtf.api.StructureDataInterface in project mmtf-spark by sbl-sdsc.

the class SecondaryStructureShiftedWord2VecEncoder method main.

/**
 * Extracts fixed-length secondary structure segments from a Pisces subset of protein
 * chains, encodes them with a shifted 3-gram Word2Vec encoder, and saves the result.
 *
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet); exactly two
 *             arguments are required, otherwise a usage message is printed and the JVM
 *             exits with status 1
 * @throws IOException declared by the signature; propagated from the called reader/encoder
 *                     methods (which callee raises it is not visible in this method)
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();
    if (args.length != 2) {
        System.err.println("Usage: " + SecondaryStructureShiftedWord2VecEncoder.class.getSimpleName() + " <outputFilePath> + <fileFormat>");
        System.exit(1);
    }
    long start = System.nanoTime();
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(SecondaryStructureShiftedWord2VecEncoder.class.getSimpleName());
    // try-with-resources shuts the Spark context down even when a step below fails
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // read MMTF Hadoop sequence file and create a non-redundant set (<=20% seq. identity)
        // of L-protein chains
        int sequenceIdentity = 20;
        double resolution = 3.0;
        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc).flatMapToPair(new StructureToPolymerChains()).filter(new Pisces(sequenceIdentity, resolution));
        // get content
        int segmentLength = 11;
        Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength);
        // create a Word2Vector representation of the protein sequences
        ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
        int windowSize = (segmentLength - 1) / 2;
        // dimension of feature vector (50)
        int vectorSize = 50;
        data = encoder.shifted3GramWord2VecEncode(windowSize, vectorSize).cache();
        data.printSchema();
        data.show(25, false);
        if (args[1].equals("json")) {
            // coalesce data into a single file
            data = data.coalesce(1);
        }
        data.write().mode("overwrite").format(args[1]).save(args[0]);
        // report the elapsed time before the context is shut down, as before
        long end = System.nanoTime();
        System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
    }
}
Also used : ProteinSequenceEncoder(edu.sdsc.mmtf.spark.ml.ProteinSequenceEncoder) StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) Pisces(edu.sdsc.mmtf.spark.webfilters.Pisces) StructureToPolymerChains(edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Row(org.apache.spark.sql.Row) SparkConf(org.apache.spark.SparkConf)

Example 35 with StructureDataInterface

use of org.rcsb.mmtf.api.StructureDataInterface in project mmtf-spark by sbl-sdsc.

the class MmtfReader method readMmtfFiles.

/**
 * Reads uncompressed and compressed MMTF files recursively from
 * a given directory.
 * This method reads files whose path contains the .mmtf or .mmtf.gz extension.
 * Files that do not match, or that fail to be read or decoded, are skipped: the failure
 * is printed to standard output and the file contributes no entry to the result.
 *
 * @param path Path to MMTF files
 * @param sc Spark context
 * @return structure data as keyword/value pairs; the key is the file name up to the
 *         first occurrence of ".mmtf"
 */
public static JavaPairRDD<String, StructureDataInterface> readMmtfFiles(String path, JavaSparkContext sc) {
    return sc.parallelize(getFiles(path)).mapToPair(new PairFunction<File, String, StructureDataInterface>() {

        private static final long serialVersionUID = 9018971417443154996L;

        @Override
        public Tuple2<String, StructureDataInterface> call(File f) throws Exception {
            try {
                String location = f.toString();
                boolean gzipped = location.contains(".mmtf.gz");
                if (!gzipped && !location.contains(".mmtf")) {
                    // not an MMTF file; removed by the filter below
                    return null;
                }
                // try-with-resources closes the file handle (and the gzip wrapper) on every
                // path; previously the streams were never closed
                try (InputStream raw = new FileInputStream(f);
                        InputStream in = gzipped ? new GZIPInputStream(raw) : raw) {
                    MmtfStructure mmtf = new MessagePackSerialization().deserialize(in);
                    String fileName = f.getName();
                    String structureId = fileName.substring(0, fileName.indexOf(".mmtf"));
                    return new Tuple2<String, StructureDataInterface>(structureId, new GenericDecoder(mmtf));
                }
            } catch (Exception e) {
                // best effort: report the problem and skip this file instead of failing the job
                System.out.println(e);
                return null;
            }
        }
    }).filter(t -> t != null);
}
Also used : GZIPInputStream(java.util.zip.GZIPInputStream) FileInputStream(java.io.FileInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) GenericDecoder(org.rcsb.mmtf.decoder.GenericDecoder) FileInputStream(java.io.FileInputStream) ZipException(java.util.zip.ZipException) IOException(java.io.IOException) FileNotFoundException(java.io.FileNotFoundException) GZIPInputStream(java.util.zip.GZIPInputStream) Tuple2(scala.Tuple2) MessagePackSerialization(org.rcsb.mmtf.serialization.MessagePackSerialization) PairFunction(org.apache.spark.api.java.function.PairFunction) File(java.io.File) MmtfStructure(org.rcsb.mmtf.dataholders.MmtfStructure)

Aggregations

StructureDataInterface (org.rcsb.mmtf.api.StructureDataInterface)102 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)60 SparkConf (org.apache.spark.SparkConf)58 Row (org.apache.spark.sql.Row)27 StructureToPolymerChains (edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains)22 Test (org.junit.Test)20 Pisces (edu.sdsc.mmtf.spark.webfilters.Pisces)19 ArrayList (java.util.ArrayList)12 ProteinSequenceEncoder (edu.sdsc.mmtf.spark.ml.ProteinSequenceEncoder)10 ColumnarStructure (edu.sdsc.mmtf.spark.utils.ColumnarStructure)10 Tuple2 (scala.Tuple2)9 Path (java.nio.file.Path)7 HashSet (java.util.HashSet)7 AdapterToStructureData (org.rcsb.mmtf.encoder.AdapterToStructureData)7 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)6 ContainsLProteinChain (edu.sdsc.mmtf.spark.filters.ContainsLProteinChain)5 List (java.util.List)5 Resolution (edu.sdsc.mmtf.spark.filters.Resolution)4 MmtfReader (edu.sdsc.mmtf.spark.io.MmtfReader)4 File (java.io.File)4