Search in sources :

Example 1 with ChainIdentifier

use of de.bioforscher.jstructure.model.identifier.ChainIdentifier in project jstructure by JonStargaryen.

the class StructureParser method parseLine.

/**
 * Parses a single line of a <tt>PDB</tt> file and updates the parser state accordingly.
 * <p>
 * Handled record types: <tt>HEADER</tt> (classification, deposition date, identifier), <tt>TITLE</tt>,
 * <tt>TER</tt>, <tt>ATOM</tt>/<tt>HETATM</tt> and the end-of-model marker. Once the end of the first model
 * has been seen, all further lines are ignored.
 * <p>
 * Optional trailing columns (title text, occupancy, b-factor, element) are read leniently: a line which
 * ends before such a column is treated as if the column were blank. Mandatory columns (atom name, residue
 * name, chain, residue number, coordinates, serial) are still read strictly.
 * @param line the line to parse
 */
private void parseLine(String line) {
    // TODO this is kinda hacky, however that way only the first model is parsed in any case
    if (passedFirstModel) {
        return;
    }
    // 63 - 66       IDcode        idCode            This identifier is unique within the PDB.
    if (line.startsWith(HEADER_PREFIX)) {
        // try to parse header line components - implicitly fallback values are provided by Structure's constructor
        try {
            String classification = line.substring(10, 50).trim();
            if (classification.isEmpty()) {
                classification = ProteinIdentifier.UNKNOWN_PROTEIN_IDENTIFIER.getAdditionalName();
            }
            protein.setClassification(classification);
        } catch (Exception e) {
            logger.warn("failed to parse classification from line '{}'", line, e);
        }
        try {
            LocalDate depositionDate = LocalDate.parse(line.substring(50, 59).trim(), StandardFormat.getPdbDateFormatInstance());
            // a date in the future is shifted back by a century
            if (depositionDate.isAfter(LocalDate.now())) {
                depositionDate = depositionDate.minusYears(100);
            }
            protein.setDepositionDate(depositionDate);
        } catch (Exception e) {
            // potential legacy header: 'HEADER    MEMBRANE PROTEIN, TRANSPORT PROTEIN, SIG 20-JUN-16   5KK2             '
            try {
                LocalDate depositionDate = LocalDate.parse(line.substring(51, 60).trim(), StandardFormat.getPdbDateFormatInstance());
                if (depositionDate.isAfter(LocalDate.now())) {
                    depositionDate = depositionDate.minusYears(100);
                }
                protein.setDepositionDate(depositionDate);
            } catch (Exception e2) {
                logger.warn("failed to parse depositionDate from line '{}'", line, e2);
            }
        }
        try {
            ProteinIdentifier proteinIdentifier = IdentifierFactory.createProteinIdentifier(line.substring(62, 66));
            protein.setProteinIdentifier(proteinIdentifier);
        } catch (Exception e) {
            // potential legacy header: 'HEADER    MEMBRANE PROTEIN, TRANSPORT PROTEIN, SIG 20-JUN-16   5KK2
            try {
                ProteinIdentifier proteinIdentifier = IdentifierFactory.createProteinIdentifier(line.substring(63, 67));
                protein.setProteinIdentifier(proteinIdentifier);
            } catch (Exception e2) {
                logger.warn("failed to parse the pdbId from line '{}'", line, e2);
            }
        }
    }
    // 11 - 80       String        title         Title of the experiment.
    if (line.startsWith(TITLE_PREFIX)) {
        // trim to omit tailing white-spaces
        // extra whitespace to ensure that words are separated
        // clipped extraction so that title lines shorter than 10 or 80 characters do not raise an exception
        titleString.append(titleString.length() == 0 ? "" : " ").append(extractColumns(line, 10, 80).trim());
    }
    if (line.startsWith(TER_PREFIX)) {
        // mark chain as terminated - everything parsed from now on, associated to this chain will be an HETATM
        // the chain id sits at index 21, so any line longer than 21 characters provides it
        String terminatedChainName = line.length() > 21 ? line.substring(21, 22) : "?";
        Chain chainToTerminate = protein.select().chainName(terminatedChainName).asOptionalChain().orElse(currentChain);
        terminatedChains.add(chainToTerminate);
    }
    // parsing atom record - information we need is marked with an '*' - indirectly needed information (chain/residue) marked with an '#'
    // some information will inform us about changing chain/residue
    /*	COLUMNS        DATA TYPE     FIELD        DEFINITION
			-------------------------------------------------------------------------------------
			1 - 6          Record name   "ATOM  "
		*	7 - 11   	   Integer       serial       Atom serial number.
		*	13 - 16        Atom          name         Atom name.
			17             Character     altLoc       Alternate location indicator.
		#	18 - 20        Residue name  resName      Residue name.
		#	22             Character     chainID      Chain identifier.
		#	23 - 26        Integer       resSeq       Residue sequence number.
			27             AChar         iCode        Code for insertion of residues.
		*	31 - 38        Real(8.3)     x            Orthogonal coordinates for X in Angstroms.
		*	39 - 46        Real(8.3)     y            Orthogonal coordinates for Y in Angstroms.
		*	47 - 54        Real(8.3)     z            Orthogonal coordinates for Z in Angstroms.
			55 - 60        Real(6.2)    occupancy     Occupancy.
			61 - 66        Real(6.2)    tempFactor    Temperature factor.
		*	77 - 78        LString(2)   element       Element symbol, right justified.
			79 - 80        LString(2)   charge        Charge on the atom */
    boolean isAtomLine = line.startsWith(Atom.ATOM_PREFIX);
    boolean isHetAtmLine = line.startsWith(Atom.HETATM_PREFIX);
    // option to skip hetatm lines - used in the aaRS project
    if (skipHetAtms && isHetAtmLine) {
        return;
    }
    if (isAtomLine || isHetAtmLine) {
        String atomName = line.substring(12, 16).trim();
        String pdbName = line.substring(17, 20).trim();
        // lines may end before the element columns - treat that like a blank element annotation
        String elementName = extractColumns(line, 76, 78).trim();
        Element element;
        if (elementName.isEmpty()) {
            // fallback for PDB files lacking annotation of elements
            element = Element.resolveFullAtomName(atomName, isHetAtmLine);
        } else {
            element = Element.resolveElementSymbol(elementName);
        }
        if (skipHydrogens && element.isHydrogen()) {
            return;
        }
        String alternativeLocationIndicator = line.substring(16, 17).trim();
        String rawChainId = line.substring(21, 22);
        // a blank chain column is mapped to the id of the unknown chain
        rawChainId = rawChainId.equals(" ") ? Chain.UNKNOWN_CHAIN.getChainIdentifier().getChainId() : rawChainId;
        ChainIdentifier chainId = IdentifierFactory.createChainIdentifier(protein.getProteinIdentifier(), rawChainId);
        int resNum = Integer.parseInt(line.substring(22, 26).trim());
        String insertionCode = line.substring(26, 27).trim();
        if (currentChain == null || !currentChain.getChainIdentifier().equals(chainId)) {
            Optional<Chain> selectedChain = protein.select().chainName(chainId.getChainId()).asOptionalChain();
            if (selectedChain.isPresent()) {
                // chain already present - just an het-group not directly connected
                currentChain = selectedChain.get();
            } else {
                // chain changed - create new chain object and set reference
                currentChain = new Chain(chainId);
                protein.addChain(currentChain);
            }
        }
        if (currentGroup == null || currentGroup.getResidueIdentifier().getResidueNumber() != resNum || !currentGroup.getResidueIdentifier().getInsertionCode().equals(insertionCode) || !currentGroup.getParentChain().getChainIdentifier().equals(chainId)) {
            // residue changed - create new group object and set reference
            currentGroup = createGroup(pdbName, IdentifierFactory.createResidueIdentifier(resNum, insertionCode), terminatedChains.contains(currentChain), minimalParsing);
            currentChain.addGroup(currentGroup);
        }
        float occupancy;
        try {
            // a line ending before this column yields "" and thus takes the same NumberFormatException path as a blank column
            occupancy = Float.parseFloat(extractColumns(line, 54, 60).trim());
        } catch (NumberFormatException e) {
            if (strictMode) {
                throw new ParsingException(e);
            } else {
                logger.debug("missing occupancy in line{}'{}'", System.lineSeparator(), line);
                occupancy = Atom.DEFAULT_OCCUPANCY;
            }
        }
        float bfactor;
        try {
            bfactor = Float.parseFloat(extractColumns(line, 60, 66).trim());
        } catch (NumberFormatException e) {
            if (strictMode) {
                throw new ParsingException(e);
            } else {
                logger.debug("missing bfactor in line{}'{}'", System.lineSeparator(), line);
                bfactor = Atom.DEFAULT_BFACTOR;
            }
        }
        // we append the current group with additional atoms
        double[] coordinates = new double[] {
                Double.parseDouble(line.substring(30, 38).trim()),
                Double.parseDouble(line.substring(38, 46).trim()),
                Double.parseDouble(line.substring(46, 54).trim())
        };
        Atom atom = Atom.builder(element, coordinates)
                .name(atomName)
                .pdbSerial(Integer.parseInt(line.substring(6, 11).trim()))
                .occupancy(occupancy)
                .bfactor(bfactor)
                .alternativeLocation(alternativeLocationIndicator)
                .build();
        // 17/05/22 - stopping to skip alternative positions
        currentGroup.addAtom(atom);
    }
    if (line.startsWith(END_MODEL_PREFIX)) {
        // TODO handling of multiple models
        passedFirstModel = true;
        logger.debug("skipping models for {}", protein.getProteinIdentifier().getFullName());
    }
}

/**
 * Extracts the characters between two indices of a line, tolerating lines which end early.
 * @param line the line to read from
 * @param beginIndex the first index to include (0-based)
 * @param endIndex the first index to exclude (0-based)
 * @return the requested range clipped to the length of the line - empty if the line ends at or before
 * <code>beginIndex</code>
 */
private String extractColumns(String line, int beginIndex, int endIndex) {
    if (line.length() <= beginIndex) {
        return "";
    }
    return line.substring(beginIndex, Math.min(endIndex, line.length()));
}
Also used : ChainIdentifier(de.bioforscher.jstructure.model.identifier.ChainIdentifier) LocalDate(java.time.LocalDate) ProteinIdentifier(de.bioforscher.jstructure.model.identifier.ProteinIdentifier)

Example 2 with ChainIdentifier

use of de.bioforscher.jstructure.model.identifier.ChainIdentifier in project jstructure by JonStargaryen.

the class StructureParser method updateProteinIdentifier.

/**
 * Assigns a new identifier to the protein and rebuilds the identifier of every chain, so that each
 * chain identifier is created from the new protein identifier while keeping its chain id.
 * @param proteinIdentifier the identifier to assign
 */
private void updateProteinIdentifier(ProteinIdentifier proteinIdentifier) {
    protein.setProteinIdentifier(proteinIdentifier);
    protein.chains().forEach(chain -> {
        // keep the chain id, swap only the protein part of the identifier
        String chainId = chain.getChainIdentifier().getChainId();
        chain.setChainIdentifier(IdentifierFactory.createChainIdentifier(proteinIdentifier, chainId));
    });
}
Also used : ChainIdentifier(de.bioforscher.jstructure.model.identifier.ChainIdentifier)

Example 3 with ChainIdentifier

use of de.bioforscher.jstructure.model.identifier.ChainIdentifier in project jstructure by JonStargaryen.

the class EarlyFoldingClassificationRunner method process.

/**
 * Computes residue-level features for a structure, classifies every amino acid and writes a report.
 * <p>
 * Progress messages are printed to standard out. PLIP contacts are annotated first; if the standard
 * routine fails, they are computed chain by chain, and if that fails too the method prints the failure
 * and returns without writing any output.
 * <p>
 * The report starts with three summary lines and a CSV header. For each chain, the amino acids of that
 * chain are listed in descending order of their predicted probability; the first 15% (rounded down) are
 * labeled <code>early</code>, the remainder <code>late</code>. Prolines are not classified and keep a
 * probability of 0.0, as do residues for which the classifier raised an exception.
 * @param structure the structure to process
 * @param outputPath the file the report is written to
 * @throws IOException if writing the report fails
 */
public void process(Structure structure, Path outputPath) throws IOException {
    // report structure characteristics
    System.out.println("structure: " + structure.getProteinIdentifier().getFullName() + "\n" + "chains: " + structure.chainsWithAminoAcids().map(Chain::getChainIdentifier).map(ChainIdentifier::getChainId).collect(Collectors.toList()) + "\n" + "total residues: " + structure.aminoAcids().count());
    System.out.println();
    // compute features
    System.out.println("computing residue-level features");
    // start with PLIP to fail fast
    System.out.println("querying PLIP-REST-Service");
    try {
        // try to annotate by standard routine
        PLIP_INTRA_MOLECULAR_ANNOTATOR.process(structure);
        System.out.println("fetched PLIP contacts");
    } catch (Exception e1) {
        // the first failure is deliberately not reported - the on-the-fly computation below is the fallback
        try {
            // potential non-pdb-entry, try to compute on-the-fly
            structure.chainsWithAminoAcids().forEach(chain -> {
                Document document = PLIPRestServiceQuery.calculateIntraChainDocument(chain);
                PLIP_INTRA_MOLECULAR_ANNOTATOR.process(chain, document);
            });
            System.out.println("computed PLIP contacts");
        } catch (Exception e2) {
            System.out.println("failed: could not compute PLIP contacts");
            e2.printStackTrace();
            return;
        }
    }
    System.out.println("computing energy profiles");
    EGOR_AGREEMENT_CALCULATOR.process(structure);
    System.out.println("annotating secondary structure elements");
    LOOP_FRACTION_CALCULATOR.process(structure);
    System.out.println("computing relative accessible surface area");
    ACCESSIBLE_SURFACE_AREA_CALCULATOR.process(structure);
    // assign feature vectors
    structure.aminoAcids().forEach(RawFeatureVector::assignRawFeatureVector);
    // smooth feature vectors
    structure.chainsWithAminoAcids().forEach(chain -> {
        List<AminoAcid> aminoAcids = chain.aminoAcids().collect(Collectors.toList());
        aminoAcids.forEach(aminoAcid -> SmoothedFeatureVector.assignSmoothedFeatureVector(aminoAcids, aminoAcid));
    });
    // classify each residue
    StringJoiner outputJoiner = new StringJoiner(System.lineSeparator());
    // print header
    outputJoiner.add("structure: '" + structure.getProteinIdentifier().getFullName() + "'").add("chains: " + structure.chainsWithAminoAcids().map(Chain::getChainIdentifier).map(ChainIdentifier::getChainId).collect(Collectors.toList())).add("total residues: " + structure.aminoAcids().count()).add("chain,res,aa,sse,energy,egor,sse_size,loop_fraction,rasa,plip_local_contacts," + "plip_local_hbonds,plip_local_hydrophobic,plip_local_backbone,plip_long_range_contacts," + "plip_long_range_hbonds,plip_long_range_hydrophobic,plip_long_range_backbone," + "plip_betweenness,plip_closeness,plip_clusteringcoefficient,plip_hbonds_betweenness," + "plip_hbonds_closeness,plip_hbonds_clusteringcoefficient,plip_hydrophobic_betweenness," + "plip_hydrophobic_closeness,plip_hydrophobic_clusteringcoefficient,conv_betweenness," + "conv_closeness,conv_clusteringcoefficient,plip_neighborhoods,conv_neighborhoods,prob,folds");
    structure.chainsWithAminoAcids().forEach(chain -> {
        // rank the residues of THIS chain only - ranking all residues of the structure for every chain
        // would emit residues of other chains and repeat them once per chain
        List<String> output = chain.aminoAcids().map(aminoAcid -> {
            boolean isProline = aminoAcid instanceof Proline;
            SmoothedFeatureVector smoothedFeatureVector = aminoAcid.getFeature(SmoothedFeatureVector.class);
            double loopFraction = aminoAcid.getFeature(LoopFraction.class).getLoopFraction();
            Instance instance = createInstance(smoothedFeatureVector, loopFraction);
            double prob = 0.0;
            if (!isProline) {
                try {
                    prob = model.distributionForInstance(normalize(instance))[0];
                } catch (Exception e) {
                    // best effort: a residue which cannot be classified keeps a probability of 0.0
                    e.printStackTrace();
                }
            }
            StringJoiner lineJoiner = new StringJoiner(",");
            lineJoiner.add(aminoAcid.getParentChain().getChainIdentifier().getChainId()).add(aminoAcid.getResidueIdentifier().toString()).add(aminoAcid.getOneLetterCode()).add(aminoAcid.getFeature(GenericSecondaryStructure.class).getSecondaryStructure().getReducedRepresentation());
            // the last attribute is skipped here - the probability is appended in its place
            for (int i = 0; i < instance.numAttributes() - 1; i++) {
                lineJoiner.add(StandardFormat.format(instance.value(i)));
            }
            lineJoiner.add(StandardFormat.format(prob));
            return lineJoiner.toString();
        }).sorted(Comparator.comparingDouble((String line) -> {
            // sort key is the last field of the line, i.e. the formatted probability
            String[] fields = line.split(",");
            return Double.parseDouble(fields[fields.length - 1]);
        }).reversed()).collect(Collectors.toList());
        // the top 15% (rounded down) of the ranked residues of the chain are reported as early folding
        int numberOfEarlyFoldingResidues = (int) (0.15 * output.size());
        for (int i = 0; i < output.size(); i++) {
            outputJoiner.add(output.get(i) + "," + (i < numberOfEarlyFoldingResidues ? "early" : "late"));
        }
    });
    // write output
    System.out.println("writing output to " + outputPath);
    // explicit charset so that the report does not depend on the platform default
    Files.write(outputPath, outputJoiner.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8));
}
Also used : java.util(java.util) LoopFraction(de.bioforscher.jstructure.feature.loopfraction.LoopFraction) Structure(de.bioforscher.jstructure.model.structure.Structure) GenericSecondaryStructure(de.bioforscher.jstructure.feature.sse.GenericSecondaryStructure) StructureParser(de.bioforscher.jstructure.model.structure.StructureParser) LoopFractionCalculator(de.bioforscher.jstructure.feature.loopfraction.LoopFractionCalculator) Classifier(weka.classifiers.Classifier) EgorAgreementCalculator(de.bioforscher.jstructure.feature.energyprofile.EgorAgreementCalculator) ProteinIdentifier(de.bioforscher.jstructure.model.identifier.ProteinIdentifier) ChainIdentifier(de.bioforscher.jstructure.model.identifier.ChainIdentifier) AminoAcid(de.bioforscher.jstructure.model.structure.aminoacid.AminoAcid) Chain(de.bioforscher.jstructure.model.structure.Chain) PLIPRestServiceQuery(de.bioforscher.jstructure.feature.interactions.PLIPRestServiceQuery) StandardFormat(de.bioforscher.jstructure.StandardFormat) PLIPIntraMolecularAnnotator(de.bioforscher.jstructure.feature.interactions.PLIPIntraMolecularAnnotator) Path(java.nio.file.Path) RawFeatureVector(de.bioforscher.start2fold.model.vector.RawFeatureVector) Proline(de.bioforscher.jstructure.model.structure.aminoacid.Proline) Files(java.nio.file.Files) AccessibleSurfaceAreaCalculator(de.bioforscher.jstructure.feature.asa.AccessibleSurfaceAreaCalculator) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) IdentifierFactory(de.bioforscher.jstructure.model.identifier.IdentifierFactory) Paths(java.nio.file.Paths) Document(org.jsoup.nodes.Document) weka.core(weka.core) SmoothedFeatureVector(de.bioforscher.start2fold.model.vector.SmoothedFeatureVector) InputStream(java.io.InputStream) Chain(de.bioforscher.jstructure.model.structure.Chain) ChainIdentifier(de.bioforscher.jstructure.model.identifier.ChainIdentifier) AminoAcid(de.bioforscher.jstructure.model.structure.aminoacid.AminoAcid) 
SmoothedFeatureVector(de.bioforscher.start2fold.model.vector.SmoothedFeatureVector) Proline(de.bioforscher.jstructure.model.structure.aminoacid.Proline) GenericSecondaryStructure(de.bioforscher.jstructure.feature.sse.GenericSecondaryStructure) Document(org.jsoup.nodes.Document) IOException(java.io.IOException) LoopFraction(de.bioforscher.jstructure.feature.loopfraction.LoopFraction) RawFeatureVector(de.bioforscher.start2fold.model.vector.RawFeatureVector)

Aggregations

ChainIdentifier (de.bioforscher.jstructure.model.identifier.ChainIdentifier)3 ProteinIdentifier (de.bioforscher.jstructure.model.identifier.ProteinIdentifier)2 StandardFormat (de.bioforscher.jstructure.StandardFormat)1 AccessibleSurfaceAreaCalculator (de.bioforscher.jstructure.feature.asa.AccessibleSurfaceAreaCalculator)1 EgorAgreementCalculator (de.bioforscher.jstructure.feature.energyprofile.EgorAgreementCalculator)1 PLIPIntraMolecularAnnotator (de.bioforscher.jstructure.feature.interactions.PLIPIntraMolecularAnnotator)1 PLIPRestServiceQuery (de.bioforscher.jstructure.feature.interactions.PLIPRestServiceQuery)1 LoopFraction (de.bioforscher.jstructure.feature.loopfraction.LoopFraction)1 LoopFractionCalculator (de.bioforscher.jstructure.feature.loopfraction.LoopFractionCalculator)1 GenericSecondaryStructure (de.bioforscher.jstructure.feature.sse.GenericSecondaryStructure)1 IdentifierFactory (de.bioforscher.jstructure.model.identifier.IdentifierFactory)1 Chain (de.bioforscher.jstructure.model.structure.Chain)1 Structure (de.bioforscher.jstructure.model.structure.Structure)1 StructureParser (de.bioforscher.jstructure.model.structure.StructureParser)1 AminoAcid (de.bioforscher.jstructure.model.structure.aminoacid.AminoAcid)1 Proline (de.bioforscher.jstructure.model.structure.aminoacid.Proline)1 RawFeatureVector (de.bioforscher.start2fold.model.vector.RawFeatureVector)1 SmoothedFeatureVector (de.bioforscher.start2fold.model.vector.SmoothedFeatureVector)1 IOException (java.io.IOException)1 InputStream (java.io.InputStream)1