Search in sources :

Example 1 with ProteinIdentifier

use of de.bioforscher.jstructure.model.identifier.ProteinIdentifier in project jstructure by JonStargaryen.

the class StructureParser method parseLine.

/**
 * Parses a single line of a <tt>PDB</tt> file and updates the parser state accordingly.
 * <p>
 * The following record types are handled, selected by the prefix of the line:
 * <ul>
 *     <li>{@code HEADER_PREFIX} - classification, deposition date and PDB id are assigned to the protein</li>
 *     <li>{@code TITLE_PREFIX} - the title fragment is appended to the accumulated title</li>
 *     <li>{@code TER_PREFIX} - the referenced chain (or the current chain) is marked as terminated</li>
 *     <li>{@code Atom.ATOM_PREFIX} / {@code Atom.HETATM_PREFIX} - an atom is created and added to the current
 *     group, chains and groups are created on demand</li>
 *     <li>{@code END_MODEL_PREFIX} - all subsequent lines are ignored, i.e. only the first model is parsed</li>
 * </ul>
 * Optional trailing columns of atom records (occupancy, B-factor, element) may be missing on truncated lines;
 * they are treated as empty fields rather than causing an index error.
 *
 * @param line the line to process
 * @throws ParsingException in strict mode, when occupancy or B-factor of an atom record cannot be parsed
 */
private void parseLine(String line) {
    // TODO this is kinda hacky, however that way only the first model is parsed in any case
    if (passedFirstModel) {
        return;
    }
    // 63 - 66       IDcode        idCode            This identifier is unique within the PDB.
    if (line.startsWith(HEADER_PREFIX)) {
        // try to parse header line components - implicitly fallback values are provided by Structure's constructor
        try {
            String classification = line.substring(10, 50).trim();
            if (classification.isEmpty()) {
                classification = ProteinIdentifier.UNKNOWN_PROTEIN_IDENTIFIER.getAdditionalName();
            }
            protein.setClassification(classification);
        } catch (Exception e) {
            logger.warn("failed to parse classification from line '{}'", line, e);
        }
        try {
            protein.setDepositionDate(parseDepositionDate(line, 50));
        } catch (Exception e) {
            // potential legacy header: 'HEADER    MEMBRANE PROTEIN, TRANSPORT PROTEIN, SIG 20-JUN-16   5KK2             '
            // there, all fields after the classification are shifted by one column
            try {
                protein.setDepositionDate(parseDepositionDate(line, 51));
            } catch (Exception e2) {
                logger.warn("failed to parse depositionDate from line '{}'", line, e2);
            }
        }
        try {
            ProteinIdentifier proteinIdentifier = IdentifierFactory.createProteinIdentifier(line.substring(62, 66));
            protein.setProteinIdentifier(proteinIdentifier);
        } catch (Exception e) {
            // potential legacy header: 'HEADER    MEMBRANE PROTEIN, TRANSPORT PROTEIN, SIG 20-JUN-16   5KK2
            try {
                ProteinIdentifier proteinIdentifier = IdentifierFactory.createProteinIdentifier(line.substring(63, 67));
                protein.setProteinIdentifier(proteinIdentifier);
            } catch (Exception e2) {
                logger.warn("failed to parse the pdbId from line '{}'", line, e2);
            }
        }
    }
    // 11 - 80       String        title         Title of the experiment.
    if (line.startsWith(TITLE_PREFIX)) {
        // fragments are trimmed to omit trailing white-spaces and joined by a single space so that words stay
        // separated - lines which end before column 11 contribute an empty fragment instead of failing
        // maybe some StringJoiner is the way to go
        titleString.append(titleString.length() == 0 ? "" : " ").append(columnOrEmpty(line, 10, 80));
    }
    if (line.startsWith(TER_PREFIX)) {
        // mark chain as terminated - everything parsed from now on, associated to this chain will be an HETATM
        // the chain id is located at index 21, so the line needs at least 22 characters to provide it
        String terminatedChainName = line.length() > 21 ? line.substring(21, 22) : "?";
        Chain chainToTerminate = protein.select().chainName(terminatedChainName).asOptionalChain().orElse(currentChain);
        terminatedChains.add(chainToTerminate);
    }
    // parsing atom record - information we need is marked with an '*' - indirectly needed information (chain/residue) marked with an '#'
    // some information will inform us about changing chain/residue
    /*  COLUMNS        DATA TYPE     FIELD        DEFINITION
        -------------------------------------------------------------------------------------
            1 - 6          Record name   "ATOM  "
        *   7 - 11         Integer       serial       Atom serial number.
        *   13 - 16        Atom          name         Atom name.
            17             Character     altLoc       Alternate location indicator.
        #   18 - 20        Residue name  resName      Residue name.
        #   22             Character     chainID      Chain identifier.
        #   23 - 26        Integer       resSeq       Residue sequence number.
            27             AChar         iCode        Code for insertion of residues.
        *   31 - 38        Real(8.3)     x            Orthogonal coordinates for X in Angstroms.
        *   39 - 46        Real(8.3)     y            Orthogonal coordinates for Y in Angstroms.
        *   47 - 54        Real(8.3)     z            Orthogonal coordinates for Z in Angstroms.
            55 - 60        Real(6.2)    occupancy     Occupancy.
            61 - 66        Real(6.2)    tempFactor    Temperature factor.
        *   77 - 78        LString(2)   element       Element symbol, right justified.
            79 - 80        LString(2)   charge        Charge on the atom */
    boolean isAtomLine = line.startsWith(Atom.ATOM_PREFIX);
    boolean isHetAtmLine = line.startsWith(Atom.HETATM_PREFIX);
    // option to skip hetatm lines - used in the aaRS project
    if (skipHetAtms && isHetAtmLine) {
        return;
    }
    if (isAtomLine || isHetAtmLine) {
        String atomName = line.substring(12, 16).trim();
        String pdbName = line.substring(17, 20).trim();
        // the element columns are optional - truncated lines yield an empty name and trigger the fallback
        String elementName = columnOrEmpty(line, 76, 78);
        Element element;
        if (elementName.isEmpty()) {
            // fallback for PDB files lacking annotation of elements
            element = Element.resolveFullAtomName(atomName, isHetAtmLine);
        } else {
            element = Element.resolveElementSymbol(elementName);
        }
        if (skipHydrogens && element.isHydrogen()) {
            return;
        }
        String alternativeLocationIndicator = line.substring(16, 17).trim();
        String rawChainId = line.substring(21, 22);
        // a blank chain column is mapped to the id of the placeholder chain
        rawChainId = rawChainId.equals(" ") ? Chain.UNKNOWN_CHAIN.getChainIdentifier().getChainId() : rawChainId;
        ChainIdentifier chainId = IdentifierFactory.createChainIdentifier(protein.getProteinIdentifier(), rawChainId);
        int resNum = Integer.parseInt(line.substring(22, 26).trim());
        String insertionCode = line.substring(26, 27).trim();
        if (currentChain == null || !currentChain.getChainIdentifier().equals(chainId)) {
            Optional<Chain> selectedChain = protein.select().chainName(chainId.getChainId()).asOptionalChain();
            if (selectedChain.isPresent()) {
                // chain already present - just an het-group not directly connected
                currentChain = selectedChain.get();
            } else {
                // chain changed - create new chain object and set reference
                currentChain = new Chain(chainId);
                protein.addChain(currentChain);
            }
        }
        if (currentGroup == null || currentGroup.getResidueIdentifier().getResidueNumber() != resNum || !currentGroup.getResidueIdentifier().getInsertionCode().equals(insertionCode) || !currentGroup.getParentChain().getChainIdentifier().equals(chainId)) {
            // residue changed - create new group object and set reference
            currentGroup = createGroup(pdbName, IdentifierFactory.createResidueIdentifier(resNum, insertionCode), terminatedChains.contains(currentChain), minimalParsing);
            currentChain.addGroup(currentGroup);
        }
        float occupancy;
        try {
            // an absent column results in an empty string, which is reported as NumberFormatException as well
            occupancy = Float.parseFloat(columnOrEmpty(line, 54, 60));
        } catch (NumberFormatException e) {
            if (strictMode) {
                throw new ParsingException(e);
            } else {
                logger.debug("missing occupancy in line{}'{}'", System.lineSeparator(), line);
                occupancy = Atom.DEFAULT_OCCUPANCY;
            }
        }
        float bfactor;
        try {
            bfactor = Float.parseFloat(columnOrEmpty(line, 60, 66));
        } catch (NumberFormatException e) {
            if (strictMode) {
                throw new ParsingException(e);
            } else {
                logger.debug("missing bfactor in line{}'{}'", System.lineSeparator(), line);
                bfactor = Atom.DEFAULT_BFACTOR;
            }
        }
        // we append the current group with additional atoms
        double[] coordinates = new double[] {
                Double.parseDouble(line.substring(30, 38).trim()),
                Double.parseDouble(line.substring(38, 46).trim()),
                Double.parseDouble(line.substring(46, 54).trim())
        };
        Atom atom = Atom.builder(element, coordinates)
                .name(atomName)
                .pdbSerial(Integer.parseInt(line.substring(6, 11).trim()))
                .occupancy(occupancy)
                .bfactor(bfactor)
                .alternativeLocation(alternativeLocationIndicator)
                .build();
        // 17/05/22 - stopping to skip alternative positions
        currentGroup.addAtom(atom);
    }
    if (line.startsWith(END_MODEL_PREFIX)) {
        // TODO handling of multiple models
        passedFirstModel = true;
        logger.debug("skipping models for {}", protein.getProteinIdentifier().getFullName());
    }
}

/**
 * Parses the 9-character deposition date of a header line, starting at the given index. Dates which lie in
 * the future are moved back by 100 years, as the year is given by two digits only.
 * @param line the header line
 * @param beginIndex the 0-based index of the first character of the date field
 * @return the parsed and century-corrected date
 */
private LocalDate parseDepositionDate(String line, int beginIndex) {
    LocalDate depositionDate = LocalDate.parse(line.substring(beginIndex, beginIndex + 9).trim(), StandardFormat.getPdbDateFormatInstance());
    return depositionDate.isAfter(LocalDate.now()) ? depositionDate.minusYears(100) : depositionDate;
}

/**
 * Extracts a trimmed, fixed-width field from a line without failing on lines which are too short.
 * @param line the line to read from
 * @param beginIndex the 0-based, inclusive start index of the field
 * @param endIndex the 0-based, exclusive end index of the field
 * @return the trimmed content of the field, truncated at the end of the line - the empty string, if the line
 * ends before the field starts
 */
private String columnOrEmpty(String line, int beginIndex, int endIndex) {
    if (beginIndex >= line.length()) {
        return "";
    }
    return line.substring(beginIndex, Math.min(endIndex, line.length())).trim();
}
Also used : ChainIdentifier(de.bioforscher.jstructure.model.identifier.ChainIdentifier) LocalDate(java.time.LocalDate) ProteinIdentifier(de.bioforscher.jstructure.model.identifier.ProteinIdentifier)

Example 2 with ProteinIdentifier

use of de.bioforscher.jstructure.model.identifier.ProteinIdentifier in project jstructure by JonStargaryen.

the class EarlyFoldingClassificationRunner method main.

/**
 * Command-line entry point. Expects exactly two arguments: the input (a PDB id or a path to a structure
 * file) and the path of the output file. The first argument is tried as PDB id first; when that attempt
 * fails for any reason, it is interpreted as a file path instead.
 * @param args the input and the output argument
 * @throws IOException declared for failures of the delegated calls
 */
public static void main(String[] args) throws IOException {
    if (args.length != 2) {
        System.out.println(String.join("\n",
                "incorrect number of arguments",
                "usage: java -jar efr.jar input output",
                "input: either '/a/path/to/some/file.pdb' or '1pdb' a pdb-id",
                "output: either '/a/absolute/path/output.csv' or 'relative.csv' a relative path"));
        return;
    }

    final String source = args[0];
    final Path destination = Paths.get(args[1]);
    System.out.println();

    // the exception of the id-based attempt doubles as the signal to fall back to the file system
    Structure parsed;
    try {
        ProteinIdentifier identifier = IdentifierFactory.createProteinIdentifier(source);
        parsed = StructureParser.fromProteinIdentifier(identifier).parse();
        System.out.println("parsing structure by pdb-id: '" + identifier + "'");
    } catch (Exception e) {
        System.out.println("parsing structure from file at: '" + source + "'");
        parsed = StructureParser.fromPath(Paths.get(source)).parse();
    }
    System.out.println();

    getInstance().process(parsed, destination);
}
Also used : Path(java.nio.file.Path) ProteinIdentifier(de.bioforscher.jstructure.model.identifier.ProteinIdentifier) Structure(de.bioforscher.jstructure.model.structure.Structure) GenericSecondaryStructure(de.bioforscher.jstructure.feature.sse.GenericSecondaryStructure) IOException(java.io.IOException)

Aggregations

ProteinIdentifier (de.bioforscher.jstructure.model.identifier.ProteinIdentifier)2 GenericSecondaryStructure (de.bioforscher.jstructure.feature.sse.GenericSecondaryStructure)1 ChainIdentifier (de.bioforscher.jstructure.model.identifier.ChainIdentifier)1 Structure (de.bioforscher.jstructure.model.structure.Structure)1 IOException (java.io.IOException)1 Path (java.nio.file.Path)1 LocalDate (java.time.LocalDate)1