Search in sources :

Example 21 with StructureDataInterface

use of org.rcsb.mmtf.api.StructureDataInterface in project mmtf-spark by sbl-sdsc.

the class StructureToProteinDimers method getAllAtomsDistanceBoxes.

/**
 * Builds one {@link DistanceBox} per chain, filled with every atom of that chain.
 * <p>
 * Each point is registered in its box under the atom's index within the chain,
 * so box {@code k} of the result corresponds to {@code chains.get(k)}.
 *
 * @param chains         structures to index, one distance box is produced for each
 * @param cutoffDistance value handed to the {@code DistanceBox} constructor
 * @return the distance boxes, in the same order as {@code chains}
 */
private static List<DistanceBox<Integer>> getAllAtomsDistanceBoxes(List<StructureDataInterface> chains, double cutoffDistance) {
    List<DistanceBox<Integer>> result = new ArrayList<>();
    for (StructureDataInterface chain : chains) {
        DistanceBox<Integer> box = new DistanceBox<>(cutoffDistance);
        for (int atomIndex = 0; atomIndex < chain.getNumAtoms(); atomIndex++) {
            Point3d position = new Point3d(
                    chain.getxCoords()[atomIndex],
                    chain.getyCoords()[atomIndex],
                    chain.getzCoords()[atomIndex]);
            box.addPoint(position, atomIndex);
        }
        result.add(box);
    }
    return result;
}
Also used : Point3d(javax.vecmath.Point3d) ArrayList(java.util.ArrayList) StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) DistanceBox(org.biojava.nbio.structure.symmetry.geometry.DistanceBox)

Example 22 with StructureDataInterface

use of org.rcsb.mmtf.api.StructureDataInterface in project mmtf-spark by sbl-sdsc.

the class StructureToProteinDimers method call.

/**
 * Splits the structure in {@code t} into chains and returns every pair of chains
 * that {@code checkPair} accepts, each pair merged into one two-chain structure.
 * <p>
 * When {@code exclusive} is set, a pair is only emitted if {@code checkList} does not
 * find its {@code calcDiff} vector among the vectors of pairs already emitted.
 *
 * @param t tuple whose second element is the structure to process
 * @return iterator over the (id, combined structure) tuples of the accepted pairs
 */
@Override
public Iterator<Tuple2<String, StructureDataInterface>> call(Tuple2<String, StructureDataInterface> t) throws Exception {
    // split the structure into a list of single-chain structures
    List<StructureDataInterface> chains = splitToChains(t._2);
    List<Vector3d> chainVectors = getChainVectors(chains);
    // one distance box per chain, built from all atoms or from the C-beta selection
    List<DistanceBox<Integer>> boxes = useAllAtoms
            ? getAllAtomsDistanceBoxes(chains, cutoffDistance)
            : getCBetaAtomsDistanceBoxes(chains, cutoffDistance);
    List<Tuple2<String, StructureDataInterface>> dimers = new ArrayList<>();
    List<Vector3d> seenVectors = new ArrayList<>();
    // visit every unordered pair of chains exactly once (second < first)
    for (int first = 0; first < chains.size(); first++) {
        for (int second = 0; second < first; second++) {
            boolean inContact = checkPair(boxes.get(first), boxes.get(second), chains.get(first), chains.get(second), cutoffDistance, contacts);
            if (!inContact) {
                continue;
            }
            if (!exclusive) {
                dimers.add(combineChains(chains.get(first), chains.get(second)));
                continue;
            }
            Vector3d pairVector = calcDiff(chainVectors.get(first), chainVectors.get(second));
            if (!checkList(pairVector, seenVectors)) {
                dimers.add(combineChains(chains.get(first), chains.get(second)));
                seenVectors.add(pairVector);
            }
        }
    }
    return dimers.iterator();
}
Also used : Tuple2(scala.Tuple2) ArrayList(java.util.ArrayList) StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) DistanceBox(org.biojava.nbio.structure.symmetry.geometry.DistanceBox)

Example 23 with StructureDataInterface

use of org.rcsb.mmtf.api.StructureDataInterface in project mmtf-spark by sbl-sdsc.

the class StructureToProteinDimers method combineChains.

/**
 * Takes two single-chain structures and returns a single structure holding both chains.
 * <p>
 * Only the first chain (index 0) of each input is copied. Crystallographic and header
 * information is taken from {@code s1}. Intra-group bonds are copied; inter-group bonds
 * are not.
 *
 * @param s1 structure whose first chain becomes the first chain (entity chain index 0) of the result
 * @param s2 structure whose first chain becomes the second chain (entity chain index 1) of the result
 * @return tuple of the combined id ({@code s1} id + {@code "_append_"} + {@code s2} id) and the combined structure
 */
private static Tuple2<String, StructureDataInterface> combineChains(StructureDataInterface s1, StructureDataInterface s2) {
    String structureId = s1.getStructureId() + "_append_" + s2.getStructureId();
    AdapterToStructureData combinedStructure = new AdapterToStructureData();
    combinedStructure.initStructure(s1.getNumBonds() + s2.getNumBonds(), s1.getNumAtoms() + s2.getNumAtoms(), s1.getNumGroups() + s2.getNumGroups(), 2, 1, structureId);
    DecoderUtils.addXtalographicInfo(s1, combinedStructure);
    DecoderUtils.addHeaderInfo(s1, combinedStructure);
    combinedStructure.setModelInfo(0, 2);
    // Both chains go through the same helper so that every value of a chain is read
    // from that chain's own source structure (the second entity's sequence used to be
    // read from s1 by mistake).
    appendFirstChain(s1, 0, combinedStructure);
    appendFirstChain(s2, 1, combinedStructure);
    combinedStructure.finalizeStructure();
    return new Tuple2<String, StructureDataInterface>(structureId, combinedStructure);
}

/**
 * Copies entity, chain, group, atom and intra-group bond data of the first chain of
 * {@code source} into {@code target}.
 *
 * @param source     structure to read the first chain from
 * @param chainIndex index of the chain inside {@code target} that the new entity refers to
 * @param target     structure under construction that receives the chain
 */
private static void appendFirstChain(StructureDataInterface source, int chainIndex, AdapterToStructureData target) {
    // set entity and chain info
    int entityIndex = getChainToEntityIndex(source)[0];
    target.setEntityInfo(new int[] { chainIndex }, source.getEntitySequence(entityIndex), source.getEntityDescription(entityIndex), source.getEntityType(entityIndex));
    int numGroups = source.getGroupsPerChain()[0];
    target.setChainInfo(source.getChainIds()[0], source.getChainNames()[0], numGroups);
    int atomCounter = 0;
    for (int groupCounter = 0; groupCounter < numGroups; groupCounter++) {
        int groupIndex = source.getGroupTypeIndices()[groupCounter];
        int numAtomsInGroup = source.getNumAtomsInGroup(groupIndex);
        int[] bondOrders = source.getGroupBondOrders(groupIndex);
        int[] bondIndices = source.getGroupBondIndices(groupIndex);
        // set group info
        target.setGroupInfo(source.getGroupName(groupIndex), source.getGroupIds()[groupCounter], source.getInsCodes()[groupCounter], source.getGroupChemCompType(groupIndex), numAtomsInGroup, bondOrders.length, source.getGroupSingleLetterCode(groupIndex), source.getGroupSequenceIndices()[groupCounter], source.getSecStructList()[groupCounter]);
        // per-atom values are indexed by the running atom counter, per-group-type values by j
        for (int j = 0; j < numAtomsInGroup; j++, atomCounter++) {
            target.setAtomInfo(source.getGroupAtomNames(groupIndex)[j], source.getAtomIds()[atomCounter], source.getAltLocIds()[atomCounter], source.getxCoords()[atomCounter], source.getyCoords()[atomCounter], source.getzCoords()[atomCounter], source.getOccupancies()[atomCounter], source.getbFactors()[atomCounter], source.getGroupElementNames(groupIndex)[j], source.getGroupAtomCharges(groupIndex)[j]);
        }
        // TODO : not sure if we should add bonds like this.
        // bond indices are stored as consecutive (first atom, second atom) pairs
        for (int j = 0; j < bondOrders.length; j++) {
            target.setGroupBond(bondIndices[j * 2], bondIndices[j * 2 + 1], bondOrders[j]);
        }
    }
}
Also used : AdapterToStructureData(org.rcsb.mmtf.encoder.AdapterToStructureData) StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface)

Example 24 with StructureDataInterface

use of org.rcsb.mmtf.api.StructureDataInterface in project mmtf-spark by sbl-sdsc.

the class StructureToProteinDimers method splitToChains.

/**
 * Splits the first model of a structure into single-chain structures and returns
 * those that are peptide polymers.
 * <p>
 * A chain is kept when {@code EncoderUtils.getTypeFromChainId} reports {@code "polymer"}
 * for it and the chem-comp type of every one of its groups equals
 * {@code ContainsPolymerChainType.L_PEPTIDE_LINKING} or
 * {@code ContainsPolymerChainType.PEPTIDE_LINKING}.
 * Each chain structure gets the id
 * {@code <structureId>.<chainName>.<chainId>.<entityIndex + 1>}.
 *
 * @param s structure to split
 * @return the retained single-chain structures, in chain order
 */
private static List<StructureDataInterface> splitToChains(StructureDataInterface s) {
    List<StructureDataInterface> chains = new ArrayList<StructureDataInterface>();
    int numChains = s.getChainsPerModel()[0];
    int[] chainToEntityIndex = getChainToEntityIndex(s);
    int[] atomsPerChain = new int[numChains];
    int[] bondsPerChain = new int[numChains];
    getNumAtomsAndBonds(s, atomsPerChain, bondsPerChain);
    for (int i = 0, atomCounter = 0, groupCounter = 0; i < numChains; i++) {
        AdapterToStructureData newChain = new AdapterToStructureData();
        int entityToChainIndex = chainToEntityIndex[i];
        // Atoms of this chain occupy the contiguous range [chainAtomStart, atomCounter)
        // of the source once the group loop below has run; that range is what maps a
        // source atom index to its index inside the new chain.
        int chainAtomStart = atomCounter;
        // to avoid of information loss, add chainName/IDs and entity id
        // this required by some queries
        String structureId = s.getStructureId() + "." + s.getChainNames()[i] + "." + s.getChainIds()[i] + "." + (entityToChainIndex + 1);
        // set header
        newChain.initStructure(bondsPerChain[i], atomsPerChain[i], s.getGroupsPerChain()[i], 1, 1, structureId);
        DecoderUtils.addXtalographicInfo(s, newChain);
        DecoderUtils.addHeaderInfo(s, newChain);
        // set model info (only one model: 0)
        newChain.setModelInfo(0, 1);
        // set entity and chain info
        newChain.setEntityInfo(new int[] { 0 }, s.getEntitySequence(entityToChainIndex), s.getEntityDescription(entityToChainIndex), s.getEntityType(entityToChainIndex));
        newChain.setChainInfo(s.getChainIds()[i], s.getChainNames()[i], s.getGroupsPerChain()[i]);
        for (int j = 0; j < s.getGroupsPerChain()[i]; j++, groupCounter++) {
            int groupIndex = s.getGroupTypeIndices()[groupCounter];
            // set group info
            newChain.setGroupInfo(s.getGroupName(groupIndex), s.getGroupIds()[groupCounter], s.getInsCodes()[groupCounter], s.getGroupChemCompType(groupIndex), s.getNumAtomsInGroup(groupIndex), s.getGroupBondOrders(groupIndex).length, s.getGroupSingleLetterCode(groupIndex), s.getGroupSequenceIndices()[groupCounter], s.getSecStructList()[groupCounter]);
            for (int k = 0; k < s.getNumAtomsInGroup(groupIndex); k++, atomCounter++) {
                newChain.setAtomInfo(s.getGroupAtomNames(groupIndex)[k], s.getAtomIds()[atomCounter], s.getAltLocIds()[atomCounter], s.getxCoords()[atomCounter], s.getyCoords()[atomCounter], s.getzCoords()[atomCounter], s.getOccupancies()[atomCounter], s.getbFactors()[atomCounter], s.getGroupElementNames(groupIndex)[k], s.getGroupAtomCharges(groupIndex)[k]);
            }
            // add intra-group bond info
            for (int l = 0; l < s.getGroupBondOrders(groupIndex).length; l++) {
                int bondIndOne = s.getGroupBondIndices(groupIndex)[l * 2];
                int bondIndTwo = s.getGroupBondIndices(groupIndex)[l * 2 + 1];
                int bondOrder = s.getGroupBondOrders(groupIndex)[l];
                newChain.setGroupBond(bondIndOne, bondIndTwo, bondOrder);
            }
        }
        // Add inter-group bond info: keep a bond only when both of its atoms belong to
        // this chain, and renumber the atoms relative to the start of the chain.
        // (Previously the lookup map for this step was never filled, so no inter-group
        // bond was ever copied.)
        // NOTE(review): assumes the bond count given to initStructure leaves room for
        // these bonds - confirm against getNumAtomsAndBonds.
        for (int ii = 0; ii < s.getInterGroupBondOrders().length; ii++) {
            int bondIndOne = s.getInterGroupBondIndices()[ii * 2];
            int bondIndTwo = s.getInterGroupBondIndices()[ii * 2 + 1];
            boolean oneInChain = bondIndOne >= chainAtomStart && bondIndOne < atomCounter;
            boolean twoInChain = bondIndTwo >= chainAtomStart && bondIndTwo < atomCounter;
            if (oneInChain && twoInChain) {
                int bondOrder = s.getInterGroupBondOrders()[ii];
                newChain.setInterGroupBond(bondIndOne - chainAtomStart, bondIndTwo - chainAtomStart, bondOrder);
            }
        }
        newChain.finalizeStructure();
        if ("polymer".equals(EncoderUtils.getTypeFromChainId(newChain, 0))) {
            // every group must be peptide-linking; stop at the first one that is not
            boolean match = true;
            int groupsInChain = newChain.getGroupsPerChain()[0];
            for (int j = 0; j < groupsInChain && match; j++) {
                int groupIndex = newChain.getGroupTypeIndices()[j];
                String type = newChain.getGroupChemCompType(groupIndex);
                match = type.equals(ContainsPolymerChainType.L_PEPTIDE_LINKING) || type.equals(ContainsPolymerChainType.PEPTIDE_LINKING);
            }
            if (match) {
                chains.add(newChain);
            }
        }
    }
    return chains;
}
Also used : AdapterToStructureData(org.rcsb.mmtf.encoder.AdapterToStructureData) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface)

Example 25 with StructureDataInterface

use of org.rcsb.mmtf.api.StructureDataInterface in project mmtf-spark by sbl-sdsc.

the class MapReduceExample method main.

/**
 * Counts the number of atoms in the PDB using the classic
 * map-reduce algorithm and prints the total and the elapsed time.
 *
 * @param args command-line arguments (not used)
 * @throws FileNotFoundException declared because reading the MMTF-Hadoop sequence file may throw it
 */
public static void main(String[] args) throws FileNotFoundException {
    long start = System.nanoTime();
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(MapReduceExample.class.getSimpleName());
    // try-with-resources closes the Spark context even when reading or reducing fails
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // read PDB from MMTF-Hadoop sequence file
        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readFullSequenceFile(sc);
        // count number of atoms; widen each per-structure count to long before summing,
        // otherwise the running total is an int and can overflow
        long numAtoms = pdb.map(t -> (long) t._2.getNumAtoms()).reduce(Long::sum);
        System.out.println("Total number of atoms in PDB: " + numAtoms);
        long end = System.nanoTime();
        System.out.println("Time: " + (end - start) / 1E9 + " sec.");
    }
}
Also used : StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) SparkConf(org.apache.spark.SparkConf) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) MmtfReader(edu.sdsc.mmtf.spark.io.MmtfReader) FileNotFoundException(java.io.FileNotFoundException)

Aggregations

StructureDataInterface (org.rcsb.mmtf.api.StructureDataInterface): 102; JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 60; SparkConf (org.apache.spark.SparkConf): 58; Row (org.apache.spark.sql.Row): 27; StructureToPolymerChains (edu.sdsc.mmtf.spark.mappers.StructureToPolymerChains): 22; Test (org.junit.Test): 20; Pisces (edu.sdsc.mmtf.spark.webfilters.Pisces): 19; ArrayList (java.util.ArrayList): 12; ProteinSequenceEncoder (edu.sdsc.mmtf.spark.ml.ProteinSequenceEncoder): 10; ColumnarStructure (edu.sdsc.mmtf.spark.utils.ColumnarStructure): 10; Tuple2 (scala.Tuple2): 9; Path (java.nio.file.Path): 7; HashSet (java.util.HashSet): 7; AdapterToStructureData (org.rcsb.mmtf.encoder.AdapterToStructureData): 7; JavaPairRDD (org.apache.spark.api.java.JavaPairRDD): 6; ContainsLProteinChain (edu.sdsc.mmtf.spark.filters.ContainsLProteinChain): 5; List (java.util.List): 5; Resolution (edu.sdsc.mmtf.spark.filters.Resolution): 4; MmtfReader (edu.sdsc.mmtf.spark.io.MmtfReader): 4; File (java.io.File): 4