use of de.bioforscher.start2fold.model.vector.SmoothedFeatureVector in project jstructure by JonStargaryen.
the class A01_WriteEarlyFoldingClassificationArff method handleLine.
/**
 * Turns one semicolon-separated input line into the ARFF data rows of the referenced chain.
 * <p>
 * The first three fields of the line are read as entry id, PDB id and a bracketed,
 * comma-separated list of experiment ids. The structure is parsed by its PDB id, the selected
 * experiments are applied to its first chain and one row is produced for every residue of that
 * chain which is not a proline.
 *
 * @param line the input line to process
 * @return the rows joined by the system line separator, or an empty {@link Optional} when any
 * step throws (the failure is logged as a warning)
 */
private static Optional<String> handleLine(String line) {
    try {
        logger.info("handling {}", line);
        String[] fields = line.split(";");
        String entryId = fields[0];
        String pdbId = fields[1];
        // strip the surrounding brackets before splitting the id list
        String rawExperimentIds = fields[2].replaceAll("\\[", "").replaceAll("]", "");
        List<Integer> experimentIds = Pattern.compile(",")
                .splitAsStream(rawExperimentIds)
                .map(Integer::valueOf)
                .collect(Collectors.toList());

        Chain chain = StructureParser.fromPdbId(pdbId).parse().getFirstChain();
        Start2FoldXmlParser.parseSpecificExperiment(chain,
                Start2FoldConstants.XML_DIRECTORY.resolve(entryId + ".xml"),
                experimentIds);

        List<AminoAcid> earlyFolders = chain.aminoAcids()
                .filter(residue -> residue.getFeature(Start2FoldResidueAnnotation.class).isEarly())
                .collect(Collectors.toList());
        List<AminoAcid> residues = chain.aminoAcids().collect(Collectors.toList());

        for (AminoAcid residue : residues) {
            RawFeatureVector.assignRawFeatureVector(residue);
        }
        // smoothing is handed the complete residue list of the chain
        for (AminoAcid residue : residues) {
            SmoothedFeatureVector.assignSmoothedFeatureVector(residues, residue);
        }

        String rows = residues.stream()
                .filter(residue -> !(residue instanceof Proline))
                .map(residue -> toEarlyFoldingRow(residue, earlyFolders))
                .collect(Collectors.joining(System.lineSeparator()));
        return Optional.of(rows);
    } catch (Exception e) {
        logger.warn("computation for {} failed", line, e);
        return Optional.empty();
    }
}

/**
 * Formats the feature values of one residue as a comma-separated row that ends with the class
 * label {@code early} or {@code late}.
 *
 * @param residue the residue to describe
 * @param earlyFolders the residues that are labelled {@code early}
 * @return the formatted row
 */
private static String toEarlyFoldingRow(AminoAcid residue, List<AminoAcid> earlyFolders) {
    SmoothedFeatureVector features = residue.getFeature(SmoothedFeatureVector.class);
    StringBuilder row = new StringBuilder();
    row.append(StandardFormat.format(features.getEnergy())).append(",");
    row.append(StandardFormat.format(features.getEgor())).append(",");
    row.append(StandardFormat.format(features.getSecondaryStructureElementSize())).append(",");
    // already smoothed
    row.append(StandardFormat.format(residue.getFeature(LoopFraction.class).getLoopFraction())).append(",");
    row.append(StandardFormat.format(features.getRasa())).append(",");
    row.append(StandardFormat.format(features.getLocalInteractions())).append(",");
    row.append(StandardFormat.format(features.getLocalHydrogen())).append(",");
    row.append(StandardFormat.format(features.getLocalHydrophobic())).append(",");
    row.append(StandardFormat.format(features.getLocalBackbone())).append(",");
    row.append(StandardFormat.format(features.getNonLocalInteractions())).append(",");
    row.append(StandardFormat.format(features.getNonLocalHydrogen())).append(",");
    row.append(StandardFormat.format(features.getNonLocalHydrophobic())).append(",");
    row.append(StandardFormat.format(features.getNonLocalBackbone())).append(",");
    row.append(StandardFormat.format(features.getBetweenness())).append(",");
    row.append(StandardFormat.format(features.getCloseness())).append(",");
    row.append(StandardFormat.format(features.getClusteringCoefficient())).append(",");
    row.append(StandardFormat.format(features.getHydrogenBetweenness())).append(",");
    row.append(StandardFormat.format(features.getHydrogenCloseness())).append(",");
    row.append(StandardFormat.format(features.getHydrogenClusteringCoefficient())).append(",");
    row.append(StandardFormat.format(features.getHydrophobicBetweenness())).append(",");
    row.append(StandardFormat.format(features.getHydrophobicCloseness())).append(",");
    row.append(StandardFormat.format(features.getHydrophobicClusteringCoefficient())).append(",");
    row.append(StandardFormat.format(features.getConvBetweenness())).append(",");
    row.append(StandardFormat.format(features.getConvCloseness())).append(",");
    row.append(StandardFormat.format(features.getConvClusteringCoefficient())).append(",");
    row.append(StandardFormat.format(features.getDistinctNeighborhoods())).append(",");
    row.append(StandardFormat.format(features.getConvDistinctNeighborhoods())).append(",");
    // the class label is the last column
    if (earlyFolders.contains(residue)) {
        row.append("early");
    } else {
        row.append("late");
    }
    return row.toString();
}
use of de.bioforscher.start2fold.model.vector.SmoothedFeatureVector in project jstructure by JonStargaryen.
the class EarlyFoldingClassificationRunner method process.
/**
 * Annotates the given structure with residue-level features, scores every residue with the
 * classification model and writes the result as comma-separated text to {@code outputPath}.
 * <p>
 * PLIP contacts are requested first so that a failure aborts early: if neither the standard
 * annotation nor the on-the-fly computation per chain succeeds, the stack trace is printed and
 * the method returns without writing any output.
 * <p>
 * The output starts with three descriptive lines and a column header, followed by one line per
 * residue. Residues are reported chain by chain, ordered by descending {@code prob}; the first
 * 15% (rounded down) of the residues of each chain are flagged {@code early}, all others
 * {@code late}. Prolines are not passed to the model and keep a {@code prob} of 0.0.
 *
 * @param structure the structure to process
 * @param outputPath the file the report is written to
 * @throws IOException if writing the output file fails
 */
public void process(Structure structure, Path outputPath) throws IOException {
    // report structure characteristics
    System.out.println("structure: " + structure.getProteinIdentifier().getFullName() + "\n"
            + "chains: " + structure.chainsWithAminoAcids()
                    .map(Chain::getChainIdentifier)
                    .map(ChainIdentifier::getChainId)
                    .collect(Collectors.toList()) + "\n"
            + "total residues: " + structure.aminoAcids().count());
    System.out.println();

    // compute features
    System.out.println("computing residue-level features");
    // start with PLIP to fail fast
    System.out.println("querying PLIP-REST-Service");
    try {
        // try to annotate by standard routine
        PLIP_INTRA_MOLECULAR_ANNOTATOR.process(structure);
        System.out.println("fetched PLIP contacts");
    } catch (Exception e1) {
        try {
            // potential non-pdb-entry, try to compute on-the-fly
            structure.chainsWithAminoAcids().forEach(chain -> {
                Document document = PLIPRestServiceQuery.calculateIntraChainDocument(chain);
                PLIP_INTRA_MOLECULAR_ANNOTATOR.process(chain, document);
            });
            System.out.println("computed PLIP contacts");
        } catch (Exception e2) {
            System.out.println("failed: could not compute PLIP contacts");
            e2.printStackTrace();
            return;
        }
    }

    System.out.println("computing energy profiles");
    EGOR_AGREEMENT_CALCULATOR.process(structure);
    System.out.println("annotating secondary structure elements");
    LOOP_FRACTION_CALCULATOR.process(structure);
    System.out.println("computing relative accessible surface area");
    ACCESSIBLE_SURFACE_AREA_CALCULATOR.process(structure);

    // assign feature vectors
    structure.aminoAcids().forEach(RawFeatureVector::assignRawFeatureVector);

    // smooth feature vectors, each chain is smoothed on its own residue list
    structure.chainsWithAminoAcids().forEach(chain -> {
        List<AminoAcid> aminoAcids = chain.aminoAcids().collect(Collectors.toList());
        aminoAcids.forEach(aminoAcid -> SmoothedFeatureVector.assignSmoothedFeatureVector(aminoAcids, aminoAcid));
    });

    // classify each residue
    StringJoiner outputJoiner = new StringJoiner(System.lineSeparator());
    // print header
    outputJoiner.add("structure: '" + structure.getProteinIdentifier().getFullName() + "'")
            .add("chains: " + structure.chainsWithAminoAcids()
                    .map(Chain::getChainIdentifier)
                    .map(ChainIdentifier::getChainId)
                    .collect(Collectors.toList()))
            .add("total residues: " + structure.aminoAcids().count())
            .add("chain,res,aa,sse,energy,egor,sse_size,loop_fraction,rasa,plip_local_contacts,"
                    + "plip_local_hbonds,plip_local_hydrophobic,plip_local_backbone,plip_long_range_contacts,"
                    + "plip_long_range_hbonds,plip_long_range_hydrophobic,plip_long_range_backbone,"
                    + "plip_betweenness,plip_closeness,plip_clusteringcoefficient,plip_hbonds_betweenness,"
                    + "plip_hbonds_closeness,plip_hbonds_clusteringcoefficient,plip_hydrophobic_betweenness,"
                    + "plip_hydrophobic_closeness,plip_hydrophobic_clusteringcoefficient,conv_betweenness,"
                    + "conv_closeness,conv_clusteringcoefficient,plip_neighborhoods,conv_neighborhoods,prob,folds");

    structure.chainsWithAminoAcids().forEach(chain -> {
        // Rank the residues of THIS chain only. Ranking all residues of the structure here
        // would repeat the structure-wide top entries once per chain and drop other residues.
        List<String> output = chain.aminoAcids().map(aminoAcid -> {
            boolean isProline = aminoAcid instanceof Proline;
            SmoothedFeatureVector smoothedFeatureVector = aminoAcid.getFeature(SmoothedFeatureVector.class);
            double loopFraction = aminoAcid.getFeature(LoopFraction.class).getLoopFraction();
            Instance instance = createInstance(smoothedFeatureVector, loopFraction);

            // prolines are not classified and keep a probability of 0
            double prob = 0.0;
            if (!isProline) {
                try {
                    prob = model.distributionForInstance(normalize(instance))[0];
                } catch (Exception e) {
                    // best effort: a failed prediction keeps prob at 0 for this residue
                    e.printStackTrace();
                }
            }

            StringJoiner lineJoiner = new StringJoiner(",");
            lineJoiner.add(aminoAcid.getParentChain().getChainIdentifier().getChainId())
                    .add(aminoAcid.getResidueIdentifier().toString())
                    .add(aminoAcid.getOneLetterCode())
                    .add(aminoAcid.getFeature(GenericSecondaryStructure.class).getSecondaryStructure().getReducedRepresentation());
            // the last attribute of the instance is not written
            for (int i = 0; i < instance.numAttributes() - 1; i++) {
                lineJoiner.add(StandardFormat.format(instance.value(i)));
            }
            lineJoiner.add(StandardFormat.format(prob));
            return lineJoiner.toString();
        })
                // order by the last column (the formatted prob), highest first
                .sorted(Comparator.comparingDouble((String line) -> Double.parseDouble(line.substring(line.lastIndexOf(',') + 1))).reversed())
                .collect(Collectors.toList());

        // the top 15% of the chain's ranking are reported as early folding
        int numberOfEarlyFoldingResidues = (int) (0.15 * output.size());
        for (int i = 0; i < output.size(); i++) {
            outputJoiner.add(output.get(i) + "," + (i < numberOfEarlyFoldingResidues ? "early" : "late"));
        }
    });

    // write output
    System.out.println("writing output to " + outputPath);
    Files.write(outputPath, outputJoiner.toString().getBytes());
}
use of de.bioforscher.start2fold.model.vector.SmoothedFeatureVector in project jstructure by JonStargaryen.
the class A03_WriteFunctionalClassificationArff method handleLine.
/**
 * Turns one semicolon-separated input line into the ARFF data rows of the referenced chain,
 * labelling every residue as {@code functional} or {@code non-functional}.
 * <p>
 * The second field of the line is read as PDB id; the functional residue numbers are extracted
 * from the split line. One row is produced for every residue of the first chain which is not a
 * proline.
 *
 * @param line the input line to process
 * @return the rows joined by the system line separator; an empty {@link Optional} when no
 * functional residues are annotated or when any step throws (the failure is logged as a
 * warning)
 */
private static Optional<String> handleLine(String line) {
    try {
        logger.info("handling {}", line);
        String[] fields = line.split(";");
        String pdbId = fields[1];
        Chain chain = StructureParser.fromPdbId(pdbId).parse().getFirstChain();

        List<AminoAcid> residues = chain.aminoAcids().collect(Collectors.toList());
        for (AminoAcid residue : residues) {
            RawFeatureVector.assignRawFeatureVector(residue);
        }

        List<Integer> functionalResidueNumbers = Start2FoldConstants.extractFunctioanlResidueNumbers(fields);
        // without residue numbers there is nothing to annotate, so the protein is ignored
        if (functionalResidueNumbers.isEmpty()) {
            return Optional.empty();
        }

        FunctionalResidueParser.parse(chain, functionalResidueNumbers);
        List<AminoAcid> functionalResidues = chain.aminoAcids()
                .filter(residue -> residue.getFeature(FunctionalResidueAnnotation.class).isFunctional())
                .collect(Collectors.toList());
        // ignore proteins lacking annotation of functional residues
        if (functionalResidues.isEmpty()) {
            return Optional.empty();
        }

        // smoothing is handed the complete residue list of the chain
        for (AminoAcid residue : residues) {
            SmoothedFeatureVector.assignSmoothedFeatureVector(residues, residue);
        }

        String rows = residues.stream()
                .filter(residue -> !(residue instanceof Proline))
                .map(residue -> toFunctionalRow(residue, functionalResidues))
                .collect(Collectors.joining(System.lineSeparator()));
        return Optional.of(rows);
    } catch (Exception e) {
        logger.warn("computation for {} failed", line, e);
        return Optional.empty();
    }
}

/**
 * Formats the feature values of one residue as a comma-separated row that ends with the class
 * label {@code functional} or {@code non-functional}.
 *
 * @param residue the residue to describe
 * @param functionalResidues the residues that are labelled {@code functional}
 * @return the formatted row
 */
private static String toFunctionalRow(AminoAcid residue, List<AminoAcid> functionalResidues) {
    SmoothedFeatureVector features = residue.getFeature(SmoothedFeatureVector.class);
    StringBuilder row = new StringBuilder();
    row.append(StandardFormat.format(features.getEnergy())).append(",");
    row.append(StandardFormat.format(features.getEgor())).append(",");
    row.append(StandardFormat.format(features.getSecondaryStructureElementSize())).append(",");
    // already smoothed
    row.append(StandardFormat.format(residue.getFeature(LoopFraction.class).getLoopFraction())).append(",");
    row.append(StandardFormat.format(features.getRasa())).append(",");
    row.append(StandardFormat.format(features.getLocalInteractions())).append(",");
    row.append(StandardFormat.format(features.getLocalHydrogen())).append(",");
    row.append(StandardFormat.format(features.getLocalHydrophobic())).append(",");
    row.append(StandardFormat.format(features.getLocalBackbone())).append(",");
    row.append(StandardFormat.format(features.getNonLocalInteractions())).append(",");
    row.append(StandardFormat.format(features.getNonLocalHydrogen())).append(",");
    row.append(StandardFormat.format(features.getNonLocalHydrophobic())).append(",");
    row.append(StandardFormat.format(features.getNonLocalBackbone())).append(",");
    row.append(StandardFormat.format(features.getBetweenness())).append(",");
    row.append(StandardFormat.format(features.getCloseness())).append(",");
    row.append(StandardFormat.format(features.getClusteringCoefficient())).append(",");
    row.append(StandardFormat.format(features.getHydrogenBetweenness())).append(",");
    row.append(StandardFormat.format(features.getHydrogenCloseness())).append(",");
    row.append(StandardFormat.format(features.getHydrogenClusteringCoefficient())).append(",");
    row.append(StandardFormat.format(features.getHydrophobicBetweenness())).append(",");
    row.append(StandardFormat.format(features.getHydrophobicCloseness())).append(",");
    row.append(StandardFormat.format(features.getHydrophobicClusteringCoefficient())).append(",");
    row.append(StandardFormat.format(features.getConvBetweenness())).append(",");
    row.append(StandardFormat.format(features.getConvCloseness())).append(",");
    row.append(StandardFormat.format(features.getConvClusteringCoefficient())).append(",");
    row.append(StandardFormat.format(features.getDistinctNeighborhoods())).append(",");
    row.append(StandardFormat.format(features.getConvDistinctNeighborhoods())).append(",");
    // the class label is the last column
    if (functionalResidues.contains(residue)) {
        row.append("functional");
    } else {
        row.append("non-functional");
    }
    return row.toString();
}
Aggregations