use of de.bioforscher.jstructure.model.structure.Structure in project jstructure by JonStargaryen.
the class A04_WriteTransitionStateCsv method handleLineLocally.
/**
 * Builds the CSV rows for one entry line, comparing the native structure with its reconstructions.
 *
 * <p>The line is split on {@code ;}: field 0 is the entry id (used to resolve {@code <entryId>.xml}),
 * field 1 is the PDB id and field 2 is a bracketed, comma-separated list of experiment ids.
 * For every amino acid of the first chain two rows are emitted: one tagged {@code native} with values
 * of the original chain, and one tagged {@code transition} with values averaged over all
 * reconstructed chains.
 *
 * <p>The hydrogen-bond and hydrophobic topology columns of the {@code transition} row are computed from
 * the hydrogen-bond and hydrophobic graphs respectively. Earlier revisions derived them from the full
 * PLIP graphs by mistake, so these columns repeated the full PLIP values.
 *
 * @param line the semicolon-separated entry description
 * @return the rows joined by the platform line separator, or {@link Optional#empty()} if any step fails
 */
private static Optional<String> handleLineLocally(String line) {
    try {
        System.out.println(line);
        String[] split = line.split(";");
        String entryId = split[0];
        String pdbId = split[1];
        List<Integer> experimentIds = Pattern.compile(",")
                .splitAsStream(split[2].replaceAll("\\[", "").replaceAll("]", ""))
                .map(Integer::valueOf)
                .collect(Collectors.toList());

        Structure structure = StructureParser.fromPdbId(pdbId).parse();
        Chain originalChain = structure.chains().findFirst().get();
        ProteinGraph originalFullPlipGraph = ProteinGraphFactory.createProteinGraph(originalChain, ProteinGraphFactory.InteractionScheme.SALENTIN2015);
        ProteinGraph originalHydrogenPlipGraph = ProteinGraphFactory.createProteinGraph(originalChain, ProteinGraphFactory.InteractionScheme.SALENTIN2015_HYDROGEN_BONDS);
        ProteinGraph originalHydrophobicPlipGraph = ProteinGraphFactory.createProteinGraph(originalChain, ProteinGraphFactory.InteractionScheme.SALENTIN2015_HYDROPHOBIC_INTERACTION);
        ProteinGraph originalConvGraph = ProteinGraphFactory.createProteinGraph(originalChain, ProteinGraphFactory.InteractionScheme.CALPHA8);

        Start2FoldXmlParser.parseSpecificExperiment(originalChain, Start2FoldConstants.XML_DIRECTORY.resolve(entryId + ".xml"), experimentIds);
        List<AminoAcid> earlyFoldingResidues = originalChain.aminoAcids()
                .filter(aminoAcid -> aminoAcid.getFeature(Start2FoldResidueAnnotation.class).isEarly())
                .collect(Collectors.toList());

        List<Chain> reconstructedChains = loadReconstructedChains(pdbId);
        for (Chain reconstructedChain : reconstructedChains) {
            Document document = PLIPRestServiceQuery.calculateIntraChainDocument(reconstructedChain);
            // NOTE(review): the document is computed for the reconstructed chain but processed against
            // originalChain - kept as in the original code, confirm this is intended
            PLIP_INTRA_MOLECULAR_ANNOTATOR.process(originalChain, document);
        }

        List<ProteinGraph> convGraphs = reconstructedChains.stream()
                .map(c -> ProteinGraphFactory.createProteinGraph(c, ProteinGraphFactory.InteractionScheme.CALPHA8))
                .collect(Collectors.toList());
        List<ProteinGraphCalculations> convGraphCalculations = convGraphs.stream()
                .map(ProteinGraphCalculations::new)
                .collect(Collectors.toList());
        List<ProteinGraph> fullPlipGraphs = reconstructedChains.stream()
                .map(c -> ProteinGraphFactory.createProteinGraph(c, ProteinGraphFactory.InteractionScheme.SALENTIN2015))
                .collect(Collectors.toList());
        List<ProteinGraphCalculations> fullPlipGraphCalculations = fullPlipGraphs.stream()
                .map(ProteinGraphCalculations::new)
                .collect(Collectors.toList());
        List<ProteinGraph> hydrogenPlipGraphs = reconstructedChains.stream()
                .map(c -> ProteinGraphFactory.createProteinGraph(c, ProteinGraphFactory.InteractionScheme.SALENTIN2015_HYDROGEN_BONDS))
                .collect(Collectors.toList());
        // fixed: calculations must wrap the hydrogen-bond graphs, not the full PLIP graphs
        List<ProteinGraphCalculations> hydrogenPlipGraphCalculations = hydrogenPlipGraphs.stream()
                .map(ProteinGraphCalculations::new)
                .collect(Collectors.toList());
        List<ProteinGraph> hydrophobicPlipGraphs = reconstructedChains.stream()
                .map(c -> ProteinGraphFactory.createProteinGraph(c, ProteinGraphFactory.InteractionScheme.SALENTIN2015_HYDROPHOBIC_INTERACTION))
                .collect(Collectors.toList());
        // fixed: calculations must wrap the hydrophobic graphs, not the full PLIP graphs
        List<ProteinGraphCalculations> hydrophobicPlipGraphCalculations = hydrophobicPlipGraphs.stream()
                .map(ProteinGraphCalculations::new)
                .collect(Collectors.toList());

        return Optional.of(originalChain.aminoAcids().map(aminoAcid -> {
            ResidueTopologicPropertiesContainer container = aminoAcid.getFeature(ResidueTopologicPropertiesContainer.class);
            ResidueIdentifier residueIdentifier = aminoAcid.getResidueIdentifier();
            // the chain id column is hard-coded to "A", exactly as before
            String rowPrefix = pdbId + "," + "A" + "," + residueIdentifier + "," + aminoAcid.getOneLetterCode() + ",";
            String foldingClass = earlyFoldingResidues.contains(aminoAcid) ? "early" : "late";

            String nativeRow = rowPrefix
                    + contactCountColumns(originalFullPlipGraph, aminoAcid) + ","
                    + StandardFormat.format(container.getFullPlip().getBetweenness()) + ","
                    + StandardFormat.format(container.getFullPlip().getCloseness()) + ","
                    + StandardFormat.format(container.getFullPlip().getClusteringCoefficient()) + ","
                    + contactCountColumns(originalHydrogenPlipGraph, aminoAcid) + ","
                    + StandardFormat.format(container.getHydrogenPlip().getBetweenness()) + ","
                    + StandardFormat.format(container.getHydrogenPlip().getCloseness()) + ","
                    + StandardFormat.format(container.getHydrogenPlip().getClusteringCoefficient()) + ","
                    + contactCountColumns(originalHydrophobicPlipGraph, aminoAcid) + ","
                    + StandardFormat.format(container.getHydrophobicPlip().getBetweenness()) + ","
                    + StandardFormat.format(container.getHydrophobicPlip().getCloseness()) + ","
                    + StandardFormat.format(container.getHydrophobicPlip().getClusteringCoefficient()) + ","
                    + contactCountColumns(originalConvGraph, aminoAcid) + ","
                    + StandardFormat.format(container.getConventional().getBetweenness()) + ","
                    + StandardFormat.format(container.getConventional().getCloseness()) + ","
                    + StandardFormat.format(container.getConventional().getClusteringCoefficient()) + ","
                    + container.getFullPlip().getDistinctNeighborhoodCount() + ","
                    + container.getConventional().getDistinctNeighborhoodCount() + ","
                    + StandardFormat.format(aminoAcid.getFeature(EnergyProfile.class).getSolvationEnergy()) + ","
                    + StandardFormat.format(aminoAcid.getFeature(AccessibleSurfaceArea.class).getRelativeAccessibleSurfaceArea()) + ","
                    + StandardFormat.format(aminoAcid.getFeature(LoopFraction.class).getLoopFraction()) + ","
                    + foldingClass + "," + "native";

            // averages throw NoSuchElementException when no reconstruction exists; the outer catch
            // turns that into Optional.empty(), as before
            String transitionRow = rowPrefix
                    + averagedTopologyColumns(fullPlipGraphs, fullPlipGraphCalculations, residueIdentifier) + ","
                    + averagedTopologyColumns(hydrogenPlipGraphs, hydrogenPlipGraphCalculations, residueIdentifier) + ","
                    + averagedTopologyColumns(hydrophobicPlipGraphs, hydrophobicPlipGraphCalculations, residueIdentifier) + ","
                    + averagedTopologyColumns(convGraphs, convGraphCalculations, residueIdentifier) + ","
                    + StandardFormat.format(fullPlipGraphCalculations.stream().mapToInt(calculations -> calculations.distinctNeighborhoodCount(residueIdentifier)).average().getAsDouble()) + ","
                    + StandardFormat.format(convGraphCalculations.stream().mapToInt(calculations -> calculations.distinctNeighborhoodCount(residueIdentifier)).average().getAsDouble()) + ","
                    + StandardFormat.format(reconstructedChains.stream().map(chain -> chain.select().residueIdentifier(aminoAcid.getResidueIdentifier()).asAminoAcid()).mapToDouble(aa -> aa.getFeature(EnergyProfile.class).getSolvationEnergy()).average().getAsDouble()) + ","
                    + StandardFormat.format(reconstructedChains.stream().map(chain -> chain.select().residueIdentifier(aminoAcid.getResidueIdentifier()).asAminoAcid()).mapToDouble(aa -> aa.getFeature(AccessibleSurfaceArea.class).getRelativeAccessibleSurfaceArea()).average().getAsDouble()) + ","
                    + StandardFormat.format(reconstructedChains.stream().map(chain -> chain.select().residueIdentifier(aminoAcid.getResidueIdentifier()).asAminoAcid()).mapToDouble(aa -> aa.getFeature(LoopFraction.class).getLoopFraction()).average().getAsDouble()) + ","
                    + foldingClass + "," + "transition";

            return nativeRow + System.lineSeparator() + transitionRow;
        }).collect(Collectors.joining(System.lineSeparator())));
    } catch (Exception e) {
        // the logger receives the throwable, so a separate printStackTrace() is redundant
        logger.warn("calculation failed for {}", line, e);
        return Optional.empty();
    }
}

/**
 * Parses every file whose name contains {@code _model} in the stage1 reconstruction directory of the
 * given PDB id and returns the first chain of each parsed structure.
 * The directory stream is closed before returning.
 */
private static List<Chain> loadReconstructedChains(String pdbId) throws java.io.IOException {
    try (java.util.stream.Stream<java.nio.file.Path> candidates = Files.list(Paths.get("/home/bittrich/git/phd_sb_repo/data/" + "reconstruction-start2fold/reconstructions/" + pdbId + "-early-conventional-1/stage1/"))) {
        return candidates
                .filter(path -> path.toFile().getName().contains("_model"))
                .map(path -> StructureParser.fromPath(path)
                        .forceProteinName(IdentifierFactory.createProteinIdentifier(pdbId, path.toFile().getName().split("_")[2].split("\\.")[0]))
                        .parse()
                        .getChains()
                        .get(0))
                .collect(Collectors.toList());
    }
}

/** Returns the number of all, local and non-local contacts of the residue in the graph as three comma-separated columns. */
private static String contactCountColumns(ProteinGraph graph, AminoAcid aminoAcid) {
    return graph.getContactsOf(aminoAcid).size() + ","
            + graph.getLocalContactsOf(aminoAcid).size() + ","
            + graph.getNonLocalContactsOf(aminoAcid).size();
}

/**
 * Returns six comma-separated columns for one interaction scheme, each averaged over all reconstructions:
 * contacts, local contacts, non-local contacts, betweenness, closeness and clustering coefficient.
 */
private static String averagedTopologyColumns(List<ProteinGraph> graphs, List<ProteinGraphCalculations> calculations, ResidueIdentifier residueIdentifier) {
    return StandardFormat.format(graphs.stream().mapToInt(graph -> graph.getContactsOf(residueIdentifier).size()).average().getAsDouble()) + ","
            + StandardFormat.format(graphs.stream().mapToInt(graph -> graph.getLocalContactsOf(residueIdentifier).size()).average().getAsDouble()) + ","
            + StandardFormat.format(graphs.stream().mapToInt(graph -> graph.getNonLocalContactsOf(residueIdentifier).size()).average().getAsDouble()) + ","
            + StandardFormat.format(calculations.stream().mapToDouble(calculation -> calculation.betweenness(residueIdentifier)).average().getAsDouble()) + ","
            + StandardFormat.format(calculations.stream().mapToDouble(calculation -> calculation.closeness(residueIdentifier)).average().getAsDouble()) + ","
            + StandardFormat.format(calculations.stream().mapToDouble(calculation -> calculation.clusteringCoefficient(residueIdentifier)).average().getAsDouble());
}
use of de.bioforscher.jstructure.model.structure.Structure in project jstructure by JonStargaryen.
the class A01_ReportGeneralStatistics method handleEFRLine.
/**
 * Processes one semicolon-separated entry line and adds its counts to the class-level accumulators.
 *
 * <p>Fields read from the line: 0 = entry id, 1 = PDB id, 2 = bracketed comma-separated experiment ids,
 * 3 = expected number of early folding residues, 4 = UniProt id. The whole field array is additionally
 * handed to {@code Start2FoldConstants.extractFunctioanlResidueNumbers}.
 *
 * <p>Updates {@code early}, {@code late}, {@code rasaContingencyTable} and {@code tableLines} for every
 * line; {@code functional}, {@code nonFunctional}, {@code overlap}, {@code contingencyTable} and
 * {@code functionalTableLines} are only updated when functional residues are annotated.
 *
 * @param line the semicolon-separated entry description
 */
private static void handleEFRLine(String line) {
    final String[] fields = line.split(";");
    final String entryId = fields[0];
    final String pdbId = fields[1];
    final String rawExperimentIds = fields[2].replaceAll("\\[", "").replaceAll("]", "");
    final List<Integer> experimentIds = Pattern.compile(",")
            .splitAsStream(rawExperimentIds)
            .map(Integer::valueOf)
            .collect(Collectors.toList());
    final int expectedEarlyCount = Integer.parseInt(fields[3]);

    final Chain chain = StructureParser.fromPdbId(pdbId).parse().chains().findFirst().get();
    Start2FoldXmlParser.parseSpecificExperiment(chain, Start2FoldConstants.XML_DIRECTORY.resolve(entryId + ".xml"), experimentIds);

    final List<AminoAcid> earlyResidues = chain.aminoAcids()
            .filter(residue -> residue.getFeature(Start2FoldResidueAnnotation.class).isEarly())
            .collect(Collectors.toList());
    final List<AminoAcid> lateResidues = chain.aminoAcids()
            .filter(residue -> !earlyResidues.contains(residue))
            .collect(Collectors.toList());

    early.add(earlyResidues.size());
    late.add((int) (chain.aminoAcids().count() - earlyResidues.size()));

    // report entries whose annotated count disagrees with the count stated in the line
    if (earlyResidues.size() != expectedEarlyCount) {
        System.err.println("number of EFR did not match expectation for " + entryId + ": " + earlyResidues.size() + " vs " + expectedEarlyCount);
    }

    final String uniProtId = fields[4];
    final List<Integer> functionalResidueNumbers = Start2FoldConstants.extractFunctioanlResidueNumbers(fields);
    final List<AminoAcid> functionalResidues = new ArrayList<>();
    // without an annotation of functional residues the list simply stays empty
    if (!functionalResidueNumbers.isEmpty()) {
        FunctionalResidueParser.parse(chain, functionalResidueNumbers);
        chain.aminoAcids()
                .filter(residue -> residue.getFeature(FunctionalResidueAnnotation.class).isFunctional())
                .forEach(functionalResidues::add);
    }
    final List<AminoAcid> nonFunctionalResidues = chain.aminoAcids()
            .filter(residue -> !functionalResidues.contains(residue))
            .collect(Collectors.toList());

    final List<AminoAcid> exposedResidues = chain.aminoAcids()
            .filter(residue -> residue.getFeature(AccessibleSurfaceArea.class).isExposed())
            .collect(Collectors.toList());
    final List<AminoAcid> buriedResidues = chain.aminoAcids()
            .filter(residue -> residue.getFeature(AccessibleSurfaceArea.class).isBuried())
            .collect(Collectors.toList());

    // folding class (early/late) versus burial (buried/exposed)
    rasaContingencyTable[0] += SetOperations.createIntersectionSet(earlyResidues, buriedResidues).size();
    rasaContingencyTable[1] += SetOperations.createIntersectionSet(earlyResidues, exposedResidues).size();
    rasaContingencyTable[2] += SetOperations.createIntersectionSet(lateResidues, buriedResidues).size();
    rasaContingencyTable[3] += SetOperations.createIntersectionSet(lateResidues, exposedResidues).size();

    int earlyFunctionalCount = 0;
    final boolean hasFunctionalAnnotation = !functionalResidues.isEmpty();
    if (hasFunctionalAnnotation) {
        functional.add(functionalResidues.size());
        nonFunctional.add((int) chain.aminoAcids().count() - functionalResidues.size());

        earlyFunctionalCount = SetOperations.createIntersectionSet(earlyResidues, functionalResidues).size();
        overlap.add(earlyFunctionalCount);

        // folding class (early/late) versus function (functional/non-functional)
        final int earlyFunctional = earlyFunctionalCount;
        final int earlyNonFunctional = SetOperations.createIntersectionSet(earlyResidues, nonFunctionalResidues).size();
        final int lateFunctional = SetOperations.createIntersectionSet(lateResidues, functionalResidues).size();
        final int lateNonFunctional = SetOperations.createIntersectionSet(lateResidues, nonFunctionalResidues).size();
        contingencyTable[0] += earlyFunctional;
        contingencyTable[1] += earlyNonFunctional;
        contingencyTable[2] += lateFunctional;
        contingencyTable[3] += lateNonFunctional;

        final double[] testResult = FishersExactTest.fishersExactTest(earlyFunctional, earlyNonFunctional, lateFunctional, lateNonFunctional);
        System.out.println("values: " + earlyFunctional + ", " + earlyNonFunctional + ", " + lateFunctional + ", " + lateNonFunctional);
        System.out.println("test: " + Arrays.toString(testResult));

        final String functionalRow = entryId
                + " & " + chain.aminoAcids().count()
                + " & " + earlyResidues.size()
                + " & " + functionalResidues.size()
                + " & " + earlyFunctional
                + " & " + StandardFormat.format(testResult[0])
                + " & " + "? \\\\";
        functionalTableLines.add(functionalRow);
    }

    // cells show "-" when the entry has no functional residue annotation
    final String functionalCell = hasFunctionalAnnotation ? String.valueOf(functionalResidues.size()) : "-";
    final String overlapCell = hasFunctionalAnnotation ? String.valueOf(earlyFunctionalCount) : "-";
    final String summaryRow = entryId
            + " & " + pdbId + "\\_A & " + uniProtId
            + " & " + chain.aminoAcids().count()
            + " & " + earlyResidues.size()
            + " & " + functionalCell
            + " & " + overlapCell
            + " \\\\";
    tableLines.add(summaryRow);
}
use of de.bioforscher.jstructure.model.structure.Structure in project jstructure by JonStargaryen.
the class A01_WriteEarlyFoldingClassificationArff method handleLine.
/**
 * Converts one semicolon-separated entry line into comma-separated feature rows, one per residue.
 *
 * <p>Field 0 of the line is the entry id, field 1 the PDB id and field 2 a bracketed, comma-separated
 * list of experiment ids. Raw and smoothed feature vectors are assigned to every amino acid of the
 * first chain; residues that are instances of {@code Proline} are left out of the output.
 *
 * @param line the semicolon-separated entry description
 * @return the rows joined by the platform line separator, or {@link Optional#empty()} if any step fails
 */
private static Optional<String> handleLine(String line) {
    try {
        logger.info("handling {}", line);
        final String[] fields = line.split(";");
        final String entryId = fields[0];
        final String pdbId = fields[1];
        final String rawExperimentIds = fields[2].replaceAll("\\[", "").replaceAll("]", "");
        final List<Integer> experimentIds = Pattern.compile(",")
                .splitAsStream(rawExperimentIds)
                .map(Integer::valueOf)
                .collect(Collectors.toList());

        final Chain chain = StructureParser.fromPdbId(pdbId).parse().getFirstChain();
        Start2FoldXmlParser.parseSpecificExperiment(chain, Start2FoldConstants.XML_DIRECTORY.resolve(entryId + ".xml"), experimentIds);

        final List<AminoAcid> earlyFoldingResidues = chain.aminoAcids()
                .filter(residue -> residue.getFeature(Start2FoldResidueAnnotation.class).isEarly())
                .collect(Collectors.toList());
        final List<AminoAcid> residues = chain.aminoAcids().collect(Collectors.toList());

        // raw features are assigned to all residues first, then every residue gets its smoothed vector
        for (AminoAcid residue : residues) {
            RawFeatureVector.assignRawFeatureVector(residue);
        }
        for (AminoAcid residue : residues) {
            SmoothedFeatureVector.assignSmoothedFeatureVector(residues, residue);
        }

        final String rows = residues.stream()
                .filter(residue -> !(residue instanceof Proline))
                .map(residue -> toFeatureRow(residue, earlyFoldingResidues.contains(residue)))
                .collect(Collectors.joining(System.lineSeparator()));
        return Optional.of(rows);
    } catch (Exception e) {
        logger.warn("computation for {} failed", line, e);
        return Optional.empty();
    }
}

/** Formats the feature values of one residue as a comma-separated row that ends with its class label. */
private static String toFeatureRow(AminoAcid aminoAcid, boolean early) {
    final SmoothedFeatureVector features = aminoAcid.getFeature(SmoothedFeatureVector.class);
    return String.join(",",
            StandardFormat.format(features.getEnergy()),
            StandardFormat.format(features.getEgor()),
            StandardFormat.format(features.getSecondaryStructureElementSize()),
            // already smoothed
            StandardFormat.format(aminoAcid.getFeature(LoopFraction.class).getLoopFraction()),
            StandardFormat.format(features.getRasa()),
            StandardFormat.format(features.getLocalInteractions()),
            StandardFormat.format(features.getLocalHydrogen()),
            StandardFormat.format(features.getLocalHydrophobic()),
            StandardFormat.format(features.getLocalBackbone()),
            StandardFormat.format(features.getNonLocalInteractions()),
            StandardFormat.format(features.getNonLocalHydrogen()),
            StandardFormat.format(features.getNonLocalHydrophobic()),
            StandardFormat.format(features.getNonLocalBackbone()),
            StandardFormat.format(features.getBetweenness()),
            StandardFormat.format(features.getCloseness()),
            StandardFormat.format(features.getClusteringCoefficient()),
            StandardFormat.format(features.getHydrogenBetweenness()),
            StandardFormat.format(features.getHydrogenCloseness()),
            StandardFormat.format(features.getHydrogenClusteringCoefficient()),
            StandardFormat.format(features.getHydrophobicBetweenness()),
            StandardFormat.format(features.getHydrophobicCloseness()),
            StandardFormat.format(features.getHydrophobicClusteringCoefficient()),
            StandardFormat.format(features.getConvBetweenness()),
            StandardFormat.format(features.getConvCloseness()),
            StandardFormat.format(features.getConvClusteringCoefficient()),
            StandardFormat.format(features.getDistinctNeighborhoods()),
            StandardFormat.format(features.getConvDistinctNeighborhoods()),
            early ? "early" : "late");
}
use of de.bioforscher.jstructure.model.structure.Structure in project jstructure by JonStargaryen.
the class Start2FoldXmlParserTest method shouldParseAllFiles.
/**
 * Runs the XML parser over every file listed for the XML directory; failures are logged, not rethrown.
 * Ignored by default.
 */
@Test
@Ignore
public void shouldParseAllFiles() {
    // shows that experiments may have different sequences - need to handle every one individually
    Start2FoldConstants.list(Start2FoldConstants.XML_DIRECTORY).forEach(xmlPath -> {
        try {
            logger.info("handling {}", xmlPath);
            // safe are: STF0005, STF0021
            String pdbId = Jsoup.parse(xmlPath.toFile(), "UTF-8")
                    .getElementsByTag("protein")
                    .attr("pdb_id");
            Chain firstChain = StructureParser.fromPdbId(pdbId)
                    .parse()
                    .chains()
                    .findFirst()
                    .get();
            Start2FoldXmlParser.parse(firstChain, xmlPath);
        } catch (Exception e) {
            logger.warn("inspect:", e);
        }
    });
}
use of de.bioforscher.jstructure.model.structure.Structure in project jstructure by JonStargaryen.
the class Start2FoldXmlParserTest method shouldParseStart2FoldXml.
/** Parses the STF0026.xml test resource against the first chain of PDB entry 1hrh. */
@Test
public void shouldParseStart2FoldXml() {
    Chain firstChain = StructureParser.fromPdbId("1hrh")
            .parse()
            .chains()
            .findFirst()
            .get();
    Start2FoldXmlParser.parse(firstChain, TestUtils.getResourceAsInputStream("STF0026.xml"));
}
Aggregations