use of de.bioforscher.jstructure.model.identifier.ChainIdentifier in project jstructure by JonStargaryen.
the class StructureParser method parseLine.
/**
 * Parses a single line of a <tt>PDB</tt> file and updates the parser state accordingly.
 * <p>
 * Depending on the prefix of the line, header information (classification, deposition date, identifier),
 * title fragments, chain terminations, atom records or the end of a model are handled. As soon as the end
 * of the first model was observed, all further lines are ignored.
 * <p>
 * Optional columns of atom records (occupancy, B-factor, element) may be missing or cut off: in that case
 * the element is resolved from the atom name and - unless <code>strictMode</code> is set - default values
 * are used for occupancy and B-factor.
 * @param line the line to process
 * @throws ParsingException in strict mode, when occupancy or B-factor of an atom record cannot be parsed
 */
private void parseLine(String line) {
    // TODO this is kinda hacky, however that way only the first model is parsed in any case
    if (passedFirstModel) {
        return;
    }

    // 63 - 66 IDcode idCode This identifier is unique within the PDB.
    if (line.startsWith(HEADER_PREFIX)) {
        // try to parse header line components - implicitly fallback values are provided by Structure's constructor
        try {
            String classification = line.substring(10, 50).trim();
            if (classification.isEmpty()) {
                classification = ProteinIdentifier.UNKNOWN_PROTEIN_IDENTIFIER.getAdditionalName();
            }
            protein.setClassification(classification);
        } catch (Exception e) {
            logger.warn("failed to parse classification from line '{}'", line, e);
        }

        try {
            protein.setDepositionDate(parseDepositionDate(line.substring(50, 59)));
        } catch (Exception e) {
            // potential legacy header: 'HEADER MEMBRANE PROTEIN, TRANSPORT PROTEIN, SIG 20-JUN-16 5KK2 '
            // - all columns are shifted by one position
            try {
                protein.setDepositionDate(parseDepositionDate(line.substring(51, 60)));
            } catch (Exception e2) {
                logger.warn("failed to parse depositionDate from line '{}'", line, e2);
            }
        }

        try {
            ProteinIdentifier proteinIdentifier = IdentifierFactory.createProteinIdentifier(line.substring(62, 66));
            protein.setProteinIdentifier(proteinIdentifier);
        } catch (Exception e) {
            // potential legacy header: 'HEADER MEMBRANE PROTEIN, TRANSPORT PROTEIN, SIG 20-JUN-16 5KK2
            try {
                ProteinIdentifier proteinIdentifier = IdentifierFactory.createProteinIdentifier(line.substring(63, 67));
                protein.setProteinIdentifier(proteinIdentifier);
            } catch (Exception e2) {
                logger.warn("failed to parse the pdbId from line '{}'", line, e2);
            }
        }
    }

    // 11 - 80 String title Title of the experiment.
    if (line.startsWith(TITLE_PREFIX)) {
        // trim to omit tailing white-spaces
        // extra whitespace to ensure that words are separated
        // the extraction is clamped to the line, so short title lines cannot cause an index error
        String titleFragment = extractColumns(line, 10, 80).trim();
        titleString.append(titleString.length() == 0 ? "" : " ").append(titleFragment);
    }

    if (line.startsWith(TER_PREFIX)) {
        // mark chain as terminated - everything parsed from now on, associated to this chain will be an HETATM
        // the chain id is located at index 21, i.e. it is available as soon as the line has 22 characters
        String terminatedChainName = line.length() >= 22 ? line.substring(21, 22) : "?";
        Chain chainToTerminate = protein.select()
                .chainName(terminatedChainName)
                .asOptionalChain()
                .orElse(currentChain);
        terminatedChains.add(chainToTerminate);
    }

    // parsing atom record - information we need is marked with an '*' - indirectly needed information (chain/residue) marked with an '#'
    // some information will inform us about changing chain/residue
    /*  COLUMNS        DATA TYPE     FIELD        DEFINITION
        -------------------------------------------------------------------------------------
           1 -  6      Record name   "ATOM  "
        *  7 - 11      Integer       serial       Atom serial number.
        * 13 - 16      Atom          name         Atom name.
          17           Character     altLoc       Alternate location indicator.
        # 18 - 20      Residue name  resName      Residue name.
        # 22           Character     chainID      Chain identifier.
        # 23 - 26      Integer       resSeq       Residue sequence number.
          27           AChar         iCode        Code for insertion of residues.
        * 31 - 38      Real(8.3)     x            Orthogonal coordinates for X in Angstroms.
        * 39 - 46      Real(8.3)     y            Orthogonal coordinates for Y in Angstroms.
        * 47 - 54      Real(8.3)     z            Orthogonal coordinates for Z in Angstroms.
          55 - 60      Real(6.2)     occupancy    Occupancy.
          61 - 66      Real(6.2)     tempFactor   Temperature factor.
        * 77 - 78      LString(2)    element      Element symbol, right justified.
          79 - 80      LString(2)    charge       Charge on the atom */
    boolean isAtomLine = line.startsWith(Atom.ATOM_PREFIX);
    boolean isHetAtmLine = line.startsWith(Atom.HETATM_PREFIX);

    // option to skip hetatm lines - used in the aaRS project
    if (skipHetAtms && isHetAtmLine) {
        return;
    }

    if (isAtomLine || isHetAtmLine) {
        String atomName = line.substring(12, 16).trim();
        String pdbName = line.substring(17, 20).trim();

        // lines may end before the element columns - treat that like a missing element annotation
        String elementName = extractColumns(line, 76, 78).trim();
        Element element;
        if (elementName.isEmpty()) {
            // fallback for PDB files lacking annotation of elements
            element = Element.resolveFullAtomName(atomName, isHetAtmLine);
        } else {
            element = Element.resolveElementSymbol(elementName);
        }
        if (skipHydrogens && element.isHydrogen()) {
            return;
        }

        String alternativeLocationIndicator = line.substring(16, 17).trim();
        String rawChainId = line.substring(21, 22);
        // a blank chain column is mapped to the id of the unknown chain
        rawChainId = rawChainId.equals(" ") ? Chain.UNKNOWN_CHAIN.getChainIdentifier().getChainId() : rawChainId;
        ChainIdentifier chainId = IdentifierFactory.createChainIdentifier(protein.getProteinIdentifier(), rawChainId);
        int resNum = Integer.parseInt(line.substring(22, 26).trim());
        String insertionCode = line.substring(26, 27).trim();

        if (currentChain == null || !currentChain.getChainIdentifier().equals(chainId)) {
            Optional<Chain> selectedChain = protein.select().chainName(chainId.getChainId()).asOptionalChain();
            if (selectedChain.isPresent()) {
                // chain already present - just an het-group not directly connected
                currentChain = selectedChain.get();
            } else {
                // chain changed - create new chain object and set reference
                currentChain = new Chain(chainId);
                protein.addChain(currentChain);
            }
        }

        if (currentGroup == null
                || currentGroup.getResidueIdentifier().getResidueNumber() != resNum
                || !currentGroup.getResidueIdentifier().getInsertionCode().equals(insertionCode)
                || !currentGroup.getParentChain().getChainIdentifier().equals(chainId)) {
            // residue changed - create new group object and set reference
            currentGroup = createGroup(pdbName,
                    IdentifierFactory.createResidueIdentifier(resNum, insertionCode),
                    terminatedChains.contains(currentChain),
                    minimalParsing);
            currentChain.addGroup(currentGroup);
        }

        // occupancy and B-factor are optional: absent columns yield an empty string, which is reported as
        // NumberFormatException and handled exactly like any other unparsable value
        float occupancy;
        try {
            occupancy = Float.parseFloat(extractColumns(line, 54, 60).trim());
        } catch (NumberFormatException e) {
            if (strictMode) {
                throw new ParsingException(e);
            } else {
                logger.debug("missing occupancy in line{}'{}'", System.lineSeparator(), line);
                occupancy = Atom.DEFAULT_OCCUPANCY;
            }
        }

        float bfactor;
        try {
            bfactor = Float.parseFloat(extractColumns(line, 60, 66).trim());
        } catch (NumberFormatException e) {
            if (strictMode) {
                throw new ParsingException(e);
            } else {
                logger.debug("missing bfactor in line{}'{}'", System.lineSeparator(), line);
                bfactor = Atom.DEFAULT_BFACTOR;
            }
        }

        // coordinates are mandatory - malformed values are not tolerated here
        double[] coordinates = new double[] {
                Double.parseDouble(line.substring(30, 38).trim()),
                Double.parseDouble(line.substring(38, 46).trim()),
                Double.parseDouble(line.substring(46, 54).trim())
        };

        // we append the current group with additional atoms
        Atom atom = Atom.builder(element, coordinates)
                .name(atomName)
                .pdbSerial(Integer.parseInt(line.substring(6, 11).trim()))
                .occupancy(occupancy)
                .bfactor(bfactor)
                .alternativeLocation(alternativeLocationIndicator)
                .build();
        // 17/05/22 - stopping to skip alternative positions
        currentGroup.addAtom(atom);
    }

    if (line.startsWith(END_MODEL_PREFIX)) {
        // TODO handling of multiple models
        passedFirstModel = true;
        logger.debug("skipping models for {}", protein.getProteinIdentifier().getFullName());
    }
}

/**
 * Extracts the characters between two indices of a line without failing on lines which are too short.
 * @param line the line to extract from
 * @param beginIndex the first index to extract (inclusive)
 * @param endIndex the last index to extract (exclusive)
 * @return the requested part of the line, cut off at the end of the line - the empty string when the line
 * ends before <code>beginIndex</code>
 */
private static String extractColumns(String line, int beginIndex, int endIndex) {
    int length = line.length();
    if (beginIndex >= length) {
        return "";
    }
    return line.substring(beginIndex, Math.min(endIndex, length));
}

/**
 * Parses the deposition date of a header line.
 * @param rawDate the date columns of the header line, surrounding whitespace is ignored
 * @return the parsed date - when it lies in the future it is moved back by 100 years, as the header only
 * provides a two-digit year (e.g. '20-JUN-16')
 */
private static LocalDate parseDepositionDate(String rawDate) {
    LocalDate depositionDate = LocalDate.parse(rawDate.trim(), StandardFormat.getPdbDateFormatInstance());
    return depositionDate.isAfter(LocalDate.now()) ? depositionDate.minusYears(100) : depositionDate;
}
use of de.bioforscher.jstructure.model.identifier.ChainIdentifier in project jstructure by JonStargaryen.
the class StructureParser method updateProteinIdentifier.
/**
 * Assigns a new identifier to the protein and renews the identifier of each of its chains, so that every
 * chain identifier is created from the new protein identifier while the chain id itself is retained.
 * @param proteinIdentifier the identifier to assign
 */
private void updateProteinIdentifier(ProteinIdentifier proteinIdentifier) {
    protein.setProteinIdentifier(proteinIdentifier);
    // only the protein part of each chain identifier changes - the chain id is taken over as is
    protein.chains().forEach(chainToUpdate -> chainToUpdate.setChainIdentifier(
            IdentifierFactory.createChainIdentifier(proteinIdentifier, chainToUpdate.getChainIdentifier().getChainId())));
}
use of de.bioforscher.jstructure.model.identifier.ChainIdentifier in project jstructure by JonStargaryen.
the class EarlyFoldingClassificationRunner method process.
/**
 * Computes residue-level features for a structure, predicts for each amino acid whether it is early or
 * late folding and writes a comma-separated report to the given path.
 * <p>
 * Processing steps: PLIP contacts are annotated (with an on-the-fly computation per chain as fallback),
 * energy profiles, secondary structure information and relative accessible surface area are computed,
 * raw and smoothed feature vectors are assigned and finally every residue is classified. Within each
 * chain, residues are ranked by descending probability and the top 15% are labeled 'early', all others
 * 'late'. Prolines are always assigned a probability of 0.
 * <p>
 * When PLIP contacts can neither be fetched nor computed, the failure is reported on standard out and
 * nothing is written.
 * @param structure the structure to process
 * @param outputPath the file the report is written to
 * @throws IOException when writing the report fails
 */
public void process(Structure structure, Path outputPath) throws IOException {
    // report structure characteristics
    System.out.println("structure: " + structure.getProteinIdentifier().getFullName() + "\n"
            + "chains: " + structure.chainsWithAminoAcids()
                    .map(Chain::getChainIdentifier)
                    .map(ChainIdentifier::getChainId)
                    .collect(Collectors.toList()) + "\n"
            + "total residues: " + structure.aminoAcids().count());
    System.out.println();

    // compute features
    System.out.println("computing residue-level features");
    // start with PLIP to fail fast
    System.out.println("querying PLIP-REST-Service");
    try {
        // try to annotate by standard routine
        PLIP_INTRA_MOLECULAR_ANNOTATOR.process(structure);
        System.out.println("fetched PLIP contacts");
    } catch (Exception ignored) {
        // the standard routine failing is expected for non-pdb-entries - the fallback below takes over
        try {
            // potential non-pdb-entry, try to compute on-the-fly
            structure.chainsWithAminoAcids().forEach(chain -> {
                Document document = PLIPRestServiceQuery.calculateIntraChainDocument(chain);
                PLIP_INTRA_MOLECULAR_ANNOTATOR.process(chain, document);
            });
            System.out.println("computed PLIP contacts");
        } catch (Exception e2) {
            System.out.println("failed: could not compute PLIP contacts");
            e2.printStackTrace();
            return;
        }
    }

    System.out.println("computing energy profiles");
    EGOR_AGREEMENT_CALCULATOR.process(structure);
    System.out.println("annotating secondary structure elements");
    LOOP_FRACTION_CALCULATOR.process(structure);
    System.out.println("computing relative accessible surface area");
    ACCESSIBLE_SURFACE_AREA_CALCULATOR.process(structure);

    // assign feature vectors
    structure.aminoAcids().forEach(RawFeatureVector::assignRawFeatureVector);

    // smooth feature vectors
    structure.chainsWithAminoAcids().forEach(chain -> {
        List<AminoAcid> aminoAcids = chain.aminoAcids().collect(Collectors.toList());
        aminoAcids.forEach(aminoAcid -> SmoothedFeatureVector.assignSmoothedFeatureVector(aminoAcids, aminoAcid));
    });

    // classify each residue
    StringJoiner outputJoiner = new StringJoiner(System.lineSeparator());
    // print header
    outputJoiner.add("structure: '" + structure.getProteinIdentifier().getFullName() + "'")
            .add("chains: " + structure.chainsWithAminoAcids()
                    .map(Chain::getChainIdentifier)
                    .map(ChainIdentifier::getChainId)
                    .collect(Collectors.toList()))
            .add("total residues: " + structure.aminoAcids().count())
            .add("chain,res,aa,sse,energy,egor,sse_size,loop_fraction,rasa,plip_local_contacts,"
                    + "plip_local_hbonds,plip_local_hydrophobic,plip_local_backbone,plip_long_range_contacts,"
                    + "plip_long_range_hbonds,plip_long_range_hydrophobic,plip_long_range_backbone,"
                    + "plip_betweenness,plip_closeness,plip_clusteringcoefficient,plip_hbonds_betweenness,"
                    + "plip_hbonds_closeness,plip_hbonds_clusteringcoefficient,plip_hydrophobic_betweenness,"
                    + "plip_hydrophobic_closeness,plip_hydrophobic_clusteringcoefficient,conv_betweenness,"
                    + "conv_closeness,conv_clusteringcoefficient,plip_neighborhoods,conv_neighborhoods,prob,folds");

    // lines are ranked by their last column, i.e. the formatted probability - highest probability first
    Comparator<String> byProbabilityDescending = Comparator.comparingDouble((String line) -> {
        String[] fields = line.split(",");
        return Double.parseDouble(fields[fields.length - 1]);
    }).reversed();

    structure.chainsWithAminoAcids().forEach(chain -> {
        // rank the residues of this very chain - ranking all residues of the structure here would report
        // residues of other chains and drop residues of this one as soon as multiple chains are present
        List<String> output = chain.aminoAcids()
                .map(aminoAcid -> {
                    boolean isProline = aminoAcid instanceof Proline;
                    SmoothedFeatureVector smoothedFeatureVector = aminoAcid.getFeature(SmoothedFeatureVector.class);
                    double loopFraction = aminoAcid.getFeature(LoopFraction.class).getLoopFraction();
                    Instance instance = createInstance(smoothedFeatureVector, loopFraction);

                    // prolines are never classified - they keep a probability of 0
                    double prob = 0.0;
                    if (!isProline) {
                        try {
                            prob = model.distributionForInstance(normalize(instance))[0];
                        } catch (Exception e) {
                            // best effort: a residue which cannot be classified is reported with probability 0
                            e.printStackTrace();
                        }
                    }

                    StringJoiner lineJoiner = new StringJoiner(",");
                    lineJoiner.add(aminoAcid.getParentChain().getChainIdentifier().getChainId())
                            .add(aminoAcid.getResidueIdentifier().toString())
                            .add(aminoAcid.getOneLetterCode())
                            .add(aminoAcid.getFeature(GenericSecondaryStructure.class).getSecondaryStructure().getReducedRepresentation());
                    // the last attribute of the instance is not reported
                    for (int i = 0; i < instance.numAttributes() - 1; i++) {
                        lineJoiner.add(StandardFormat.format(instance.value(i)));
                    }
                    lineJoiner.add(StandardFormat.format(prob));
                    return lineJoiner.toString();
                })
                .sorted(byProbabilityDescending)
                .collect(Collectors.toList());

        // the 15% of residues with the highest probability are considered early folding
        int numberOfResidues = output.size();
        int numberOfEarlyFoldingResidues = (int) (0.15 * numberOfResidues);
        for (int i = 0; i < numberOfResidues; i++) {
            outputJoiner.add(output.get(i) + "," + (i < numberOfEarlyFoldingResidues ? "early" : "late"));
        }
    });

    // write output
    System.out.println("writing output to " + outputPath);
    // explicit charset, so the written file does not depend on the platform default
    Files.write(outputPath, outputJoiner.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8));
}
Aggregations