use of de.bioforscher.jstructure.model.identifier.ProteinIdentifier in project jstructure by JonStargaryen.
the class StructureParser method parseLine.
/**
 * Parses a single line of a <tt>PDB</tt> file and updates the parser state accordingly.
 * <p>
 * Handled record types: the header (classification, deposition date, PDB id), title lines,
 * chain terminators, atom/hetatm records and the end-of-model marker. Once the end of the first
 * model has been seen, every further line is ignored.
 * <p>
 * Optional trailing columns of atom records (occupancy, b-factor, element) may be missing when
 * lines are truncated; in that case the same fallbacks apply as for blank columns.
 *
 * @param line the line to parse
 * @throws ParsingException in strict mode, when occupancy or b-factor cannot be parsed
 */
private void parseLine(String line) {
    // TODO this is kinda hacky, however that way only the first model is parsed in any case
    if (passedFirstModel) {
        return;
    }

    // 63 - 66 IDcode idCode This identifier is unique within the PDB.
    if (line.startsWith(HEADER_PREFIX)) {
        // try to parse header line components - implicitly fallback values are provided by Structure's constructor
        try {
            String classification = line.substring(10, 50).trim();
            if (classification.isEmpty()) {
                classification = ProteinIdentifier.UNKNOWN_PROTEIN_IDENTIFIER.getAdditionalName();
            }
            protein.setClassification(classification);
        } catch (Exception e) {
            logger.warn("failed to parse classification from line '{}'", line, e);
        }

        try {
            protein.setDepositionDate(parseHeaderDepositionDate(line, 50, 59));
        } catch (Exception e) {
            // potential legacy header: 'HEADER MEMBRANE PROTEIN, TRANSPORT PROTEIN, SIG 20-JUN-16 5KK2 '
            // - retry with all columns shifted by one
            try {
                protein.setDepositionDate(parseHeaderDepositionDate(line, 51, 60));
            } catch (Exception e2) {
                logger.warn("failed to parse depositionDate from line '{}'", line, e2);
            }
        }

        try {
            ProteinIdentifier proteinIdentifier = IdentifierFactory.createProteinIdentifier(line.substring(62, 66));
            protein.setProteinIdentifier(proteinIdentifier);
        } catch (Exception e) {
            // potential legacy header: 'HEADER MEMBRANE PROTEIN, TRANSPORT PROTEIN, SIG 20-JUN-16 5KK2
            // - retry with all columns shifted by one
            try {
                ProteinIdentifier proteinIdentifier = IdentifierFactory.createProteinIdentifier(line.substring(63, 67));
                protein.setProteinIdentifier(proteinIdentifier);
            } catch (Exception e2) {
                logger.warn("failed to parse the pdbId from line '{}'", line, e2);
            }
        }
    }

    // 11 - 80 String title Title of the experiment.
    if (line.startsWith(TITLE_PREFIX)) {
        // trimmed to omit trailing white-spaces, a single blank separates continuation lines
        // maybe some StringJoiner is the way to go
        titleString.append(titleString.length() == 0 ? "" : " ")
                .append(clippedColumns(line, 10, 80));
    }

    if (line.startsWith(TER_PREFIX)) {
        // mark chain as terminated - everything parsed from now on, associated to this chain will be an HETATM
        // the chain id sits in column 22 (index 21) - it is available as soon as the line has 22 characters
        String terminatedChainName = line.length() > 21 ? line.substring(21, 22) : "?";
        Chain chainToTerminate = protein.select()
                .chainName(terminatedChainName)
                .asOptionalChain()
                .orElse(currentChain);
        terminatedChains.add(chainToTerminate);
    }

    // parsing atom record - information we need is marked with an '*' - indirectly needed information (chain/residue) marked with an '#'
    // some information will inform us about changing chain/residue
    /*  COLUMNS        DATA TYPE     FIELD        DEFINITION
        -------------------------------------------------------------------------------------
           1 -  6      Record name   "ATOM  "
        *  7 - 11      Integer       serial       Atom serial number.
        * 13 - 16      Atom          name         Atom name.
          17           Character     altLoc       Alternate location indicator.
        # 18 - 20      Residue name  resName      Residue name.
        # 22           Character     chainID      Chain identifier.
        # 23 - 26      Integer       resSeq       Residue sequence number.
          27           AChar         iCode        Code for insertion of residues.
        * 31 - 38      Real(8.3)     x            Orthogonal coordinates for X in Angstroms.
        * 39 - 46      Real(8.3)     y            Orthogonal coordinates for Y in Angstroms.
        * 47 - 54      Real(8.3)     z            Orthogonal coordinates for Z in Angstroms.
          55 - 60      Real(6.2)     occupancy    Occupancy.
          61 - 66      Real(6.2)     tempFactor   Temperature factor.
        * 77 - 78      LString(2)    element      Element symbol, right justified.
          79 - 80      LString(2)    charge       Charge on the atom */
    boolean isAtomLine = line.startsWith(Atom.ATOM_PREFIX);
    boolean isHetAtmLine = line.startsWith(Atom.HETATM_PREFIX);

    // option to skip hetatm lines - used in the aaRS project
    if (skipHetAtms && isHetAtmLine) {
        return;
    }

    if (isAtomLine || isHetAtmLine) {
        String atomName = line.substring(12, 16).trim();
        String pdbName = line.substring(17, 20).trim();
        // the element columns are optional: truncated lines yield an empty name instead of an exception
        String elementName = clippedColumns(line, 76, 78);

        Element element;
        if (elementName.isEmpty()) {
            // fallback for PDB files lacking annotation of elements
            element = Element.resolveFullAtomName(atomName, isHetAtmLine);
        } else {
            element = Element.resolveElementSymbol(elementName);
        }
        if (skipHydrogens && element.isHydrogen()) {
            return;
        }

        String alternativeLocationIndicator = line.substring(16, 17).trim();
        String rawChainId = line.substring(21, 22);
        // a blank chain column is mapped to the id of the placeholder chain
        rawChainId = rawChainId.equals(" ") ? Chain.UNKNOWN_CHAIN.getChainIdentifier().getChainId() : rawChainId;
        ChainIdentifier chainId = IdentifierFactory.createChainIdentifier(protein.getProteinIdentifier(), rawChainId);
        int resNum = Integer.parseInt(line.substring(22, 26).trim());
        String insertionCode = line.substring(26, 27).trim();

        if (currentChain == null || !currentChain.getChainIdentifier().equals(chainId)) {
            Optional<Chain> selectedChain = protein.select().chainName(chainId.getChainId()).asOptionalChain();
            if (selectedChain.isPresent()) {
                // chain already present - just an het-group not directly connected
                currentChain = selectedChain.get();
            } else {
                // chain changed - create new chain object and set reference
                currentChain = new Chain(chainId);
                protein.addChain(currentChain);
            }
        }

        if (currentGroup == null
                || currentGroup.getResidueIdentifier().getResidueNumber() != resNum
                || !currentGroup.getResidueIdentifier().getInsertionCode().equals(insertionCode)
                || !currentGroup.getParentChain().getChainIdentifier().equals(chainId)) {
            // residue changed - create new group object and set reference
            currentGroup = createGroup(pdbName,
                    IdentifierFactory.createResidueIdentifier(resNum, insertionCode),
                    terminatedChains.contains(currentChain),
                    minimalParsing);
            currentChain.addGroup(currentGroup);
        }

        // occupancy and b-factor are optional: blank or missing columns end up as an empty string, which
        // fails number parsing and thereby triggers the strict-mode error respectively the default value
        float occupancy;
        try {
            occupancy = Float.parseFloat(clippedColumns(line, 54, 60));
        } catch (NumberFormatException e) {
            if (strictMode) {
                throw new ParsingException(e);
            } else {
                logger.debug("missing occupancy in line{}'{}'", System.lineSeparator(), line);
                occupancy = Atom.DEFAULT_OCCUPANCY;
            }
        }

        float bfactor;
        try {
            bfactor = Float.parseFloat(clippedColumns(line, 60, 66));
        } catch (NumberFormatException e) {
            if (strictMode) {
                throw new ParsingException(e);
            } else {
                logger.debug("missing bfactor in line{}'{}'", System.lineSeparator(), line);
                bfactor = Atom.DEFAULT_BFACTOR;
            }
        }

        // serial number and coordinates are mandatory - malformed values still raise an exception
        double[] coordinates = new double[] {
                Double.parseDouble(line.substring(30, 38).trim()),
                Double.parseDouble(line.substring(38, 46).trim()),
                Double.parseDouble(line.substring(46, 54).trim())
        };

        // we append the current group with additional atoms
        Atom atom = Atom.builder(element, coordinates)
                .name(atomName)
                .pdbSerial(Integer.parseInt(line.substring(6, 11).trim()))
                .occupancy(occupancy)
                .bfactor(bfactor)
                .alternativeLocation(alternativeLocationIndicator)
                .build();
        // 17/05/22 - stopping to skip alternative positions
        currentGroup.addAtom(atom);
    }

    if (line.startsWith(END_MODEL_PREFIX)) {
        // TODO handling of multiple models
        passedFirstModel = true;
        logger.debug("skipping models for {}", protein.getProteinIdentifier().getFullName());
    }
}

/**
 * Reads the deposition date of a header line from the given column range. Dates which lie in the
 * future are moved back by 100 years (presumably compensating for two-digit years in the header).
 *
 * @param line the header line
 * @param beginIndex the first index of the date (inclusive)
 * @param endIndex the last index of the date (exclusive)
 * @return the parsed and potentially corrected date
 * @throws RuntimeException if the line is too short or the columns do not contain a valid date
 */
private LocalDate parseHeaderDepositionDate(String line, int beginIndex, int endIndex) {
    LocalDate depositionDate = LocalDate.parse(line.substring(beginIndex, endIndex).trim(), StandardFormat.getPdbDateFormatInstance());
    if (depositionDate.isAfter(LocalDate.now())) {
        depositionDate = depositionDate.minusYears(100);
    }
    return depositionDate;
}

/**
 * Extracts the trimmed content of a column range, clipping the range to the actual length of the
 * line rather than failing on truncated lines.
 *
 * @param line the line to read from
 * @param beginIndex the first index (inclusive)
 * @param endIndex the last index (exclusive)
 * @return the trimmed content, or an empty string if the line ends before <code>beginIndex</code>
 */
private String clippedColumns(String line, int beginIndex, int endIndex) {
    int clippedEnd = Math.min(endIndex, line.length());
    return beginIndex < clippedEnd ? line.substring(beginIndex, clippedEnd).trim() : "";
}
use of de.bioforscher.jstructure.model.identifier.ProteinIdentifier in project jstructure by JonStargaryen.
the class EarlyFoldingClassificationRunner method main.
/**
 * Command-line entry point. Expects exactly two arguments: the input (a PDB id or a path to a
 * PDB file) and the path of the output file. With any other number of arguments a usage message
 * is printed and nothing else happens.
 * <p>
 * The input is first treated as PDB id; if that attempt fails for any reason, it is treated as
 * path to a local file.
 *
 * @param args the input and the output argument
 * @throws IOException declared for failures while parsing or processing
 */
public static void main(String[] args) throws IOException {
    if (args.length != 2) {
        String usage = String.join("\n",
                "incorrect number of arguments",
                "usage: java -jar efr.jar input output",
                "input: either '/a/path/to/some/file.pdb' or '1pdb' a pdb-id",
                "output: either '/a/absolute/path/output.csv' or 'relative.csv' a relative path");
        System.out.println(usage);
        return;
    }

    String source = args[0];
    Path destination = Paths.get(args[1]);
    System.out.println();

    // parse structure - classic flow-control by exceptions: anything going wrong while resolving
    // the input as pdb-id leads to interpreting it as file path
    Structure parsed;
    try {
        ProteinIdentifier identifier = IdentifierFactory.createProteinIdentifier(source);
        parsed = StructureParser.fromProteinIdentifier(identifier).parse();
        System.out.println("parsing structure by pdb-id: '" + identifier + "'");
    } catch (Exception e) {
        System.out.println("parsing structure from file at: '" + source + "'");
        parsed = StructureParser.fromPath(Paths.get(source)).parse();
    }
    System.out.println();

    getInstance().process(parsed, destination);
}
Aggregations