Search in sources :

Example 1 with RawFeatureVector

use of de.bioforscher.start2fold.model.vector.RawFeatureVector in project jstructure by JonStargaryen.

the class EarlyFoldingClassificationRunner method process.

/**
 * Computes residue-level features for a structure, scores every residue with the
 * classification model and writes a comma-separated report to {@code outputPath}.
 *
 * <p>The report starts with three header lines (structure name, chain ids, residue count)
 * and a column header. For each chain with amino acids, the residues of that chain are
 * ranked by descending score and the top 15% (rounded down) are labelled {@code early},
 * the remainder {@code late}. Prolines are never passed to the model and keep a score of 0.
 *
 * <p>If PLIP contacts can neither be fetched nor computed on-the-fly, the method prints the
 * failure and returns without writing any output.
 *
 * @param structure the structure to annotate and classify
 * @param outputPath the file the report is written to
 * @throws IOException if writing the report fails
 */
public void process(Structure structure, Path outputPath) throws IOException {
    // report structure characteristics
    System.out.println("structure: " + structure.getProteinIdentifier().getFullName() + "\n"
            + "chains: " + structure.chainsWithAminoAcids()
                    .map(Chain::getChainIdentifier)
                    .map(ChainIdentifier::getChainId)
                    .collect(Collectors.toList()) + "\n"
            + "total residues: " + structure.aminoAcids().count());
    System.out.println();

    // compute features
    System.out.println("computing residue-level features");
    // start with PLIP to fail fast
    System.out.println("querying PLIP-REST-Service");
    try {
        // try to annotate by standard routine
        PLIP_INTRA_MOLECULAR_ANNOTATOR.process(structure);
        System.out.println("fetched PLIP contacts");
    } catch (Exception ignored) {
        // deliberately ignored: the standard routine failing is the trigger for the
        // on-the-fly fallback below; only a failure of the fallback is reported
        try {
            // potential non-pdb-entry, try to compute on-the-fly
            structure.chainsWithAminoAcids().forEach(chain -> {
                Document document = PLIPRestServiceQuery.calculateIntraChainDocument(chain);
                PLIP_INTRA_MOLECULAR_ANNOTATOR.process(chain, document);
            });
            System.out.println("computed PLIP contacts");
        } catch (Exception e2) {
            System.out.println("failed: could not compute PLIP contacts");
            e2.printStackTrace();
            return;
        }
    }
    System.out.println("computing energy profiles");
    EGOR_AGREEMENT_CALCULATOR.process(structure);
    System.out.println("annotating secondary structure elements");
    LOOP_FRACTION_CALCULATOR.process(structure);
    System.out.println("computing relative accessible surface area");
    ACCESSIBLE_SURFACE_AREA_CALCULATOR.process(structure);

    // assign feature vectors
    structure.aminoAcids().forEach(RawFeatureVector::assignRawFeatureVector);
    // smooth feature vectors chain by chain, so smoothing only sees residues of one chain
    structure.chainsWithAminoAcids().forEach(chain -> {
        List<AminoAcid> aminoAcids = chain.aminoAcids().collect(Collectors.toList());
        aminoAcids.forEach(aminoAcid -> SmoothedFeatureVector.assignSmoothedFeatureVector(aminoAcids, aminoAcid));
    });

    // classify each residue
    StringJoiner outputJoiner = new StringJoiner(System.lineSeparator());
    // print header
    outputJoiner.add("structure: '" + structure.getProteinIdentifier().getFullName() + "'")
            .add("chains: " + structure.chainsWithAminoAcids()
                    .map(Chain::getChainIdentifier)
                    .map(ChainIdentifier::getChainId)
                    .collect(Collectors.toList()))
            .add("total residues: " + structure.aminoAcids().count())
            .add("chain,res,aa,sse,energy,egor,sse_size,loop_fraction,rasa,plip_local_contacts,"
                    + "plip_local_hbonds,plip_local_hydrophobic,plip_local_backbone,plip_long_range_contacts,"
                    + "plip_long_range_hbonds,plip_long_range_hydrophobic,plip_long_range_backbone,"
                    + "plip_betweenness,plip_closeness,plip_clusteringcoefficient,plip_hbonds_betweenness,"
                    + "plip_hbonds_closeness,plip_hbonds_clusteringcoefficient,plip_hydrophobic_betweenness,"
                    + "plip_hydrophobic_closeness,plip_hydrophobic_clusteringcoefficient,conv_betweenness,"
                    + "conv_closeness,conv_clusteringcoefficient,plip_neighborhoods,conv_neighborhoods,prob,folds");

    structure.chainsWithAminoAcids().forEach(chain -> {
        // Rank the residues of THIS chain only. Streaming the whole structure here would
        // repeat the global ranking for every chain and apply the 15% cut-off to the
        // wrong set of residues.
        List<String> rankedLines = chain.aminoAcids().map(aminoAcid -> {
            boolean isProline = aminoAcid instanceof Proline;
            SmoothedFeatureVector smoothedFeatureVector = aminoAcid.getFeature(SmoothedFeatureVector.class);
            double loopFraction = aminoAcid.getFeature(LoopFraction.class).getLoopFraction();
            Instance instance = createInstance(smoothedFeatureVector, loopFraction);
            double prob = 0.0;
            if (!isProline) {
                try {
                    prob = model.distributionForInstance(normalize(instance))[0];
                } catch (Exception e) {
                    // best effort: a residue that cannot be classified keeps a score of 0
                    e.printStackTrace();
                }
            }
            StringJoiner lineJoiner = new StringJoiner(",");
            lineJoiner.add(aminoAcid.getParentChain().getChainIdentifier().getChainId())
                    .add(aminoAcid.getResidueIdentifier().toString())
                    .add(aminoAcid.getOneLetterCode())
                    .add(aminoAcid.getFeature(GenericSecondaryStructure.class).getSecondaryStructure().getReducedRepresentation());
            // the last attribute of the instance is skipped; the score is written in its place
            for (int i = 0; i < instance.numAttributes() - 1; i++) {
                lineJoiner.add(StandardFormat.format(instance.value(i)));
            }
            lineJoiner.add(StandardFormat.format(prob));
            return lineJoiner.toString();
        }).sorted(Comparator.comparingDouble((String line) -> {
            // the score is the last column of the line; split once per key extraction
            String[] fields = line.split(",");
            return Double.parseDouble(fields[fields.length - 1]);
        }).reversed()).collect(Collectors.toList());

        // top 15% (rounded down) of the chain's residues are reported as early folding
        int numberOfEarlyFoldingResidues = (int) (0.15 * rankedLines.size());
        for (int i = 0; i < rankedLines.size(); i++) {
            outputJoiner.add(rankedLines.get(i) + "," + (i < numberOfEarlyFoldingResidues ? "early" : "late"));
        }
    });

    // write output with an explicit charset instead of the platform default
    System.out.println("writing output to " + outputPath);
    Files.write(outputPath, outputJoiner.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8));
}
Also used : java.util(java.util) LoopFraction(de.bioforscher.jstructure.feature.loopfraction.LoopFraction) Structure(de.bioforscher.jstructure.model.structure.Structure) GenericSecondaryStructure(de.bioforscher.jstructure.feature.sse.GenericSecondaryStructure) StructureParser(de.bioforscher.jstructure.model.structure.StructureParser) LoopFractionCalculator(de.bioforscher.jstructure.feature.loopfraction.LoopFractionCalculator) Classifier(weka.classifiers.Classifier) EgorAgreementCalculator(de.bioforscher.jstructure.feature.energyprofile.EgorAgreementCalculator) ProteinIdentifier(de.bioforscher.jstructure.model.identifier.ProteinIdentifier) ChainIdentifier(de.bioforscher.jstructure.model.identifier.ChainIdentifier) AminoAcid(de.bioforscher.jstructure.model.structure.aminoacid.AminoAcid) Chain(de.bioforscher.jstructure.model.structure.Chain) PLIPRestServiceQuery(de.bioforscher.jstructure.feature.interactions.PLIPRestServiceQuery) StandardFormat(de.bioforscher.jstructure.StandardFormat) PLIPIntraMolecularAnnotator(de.bioforscher.jstructure.feature.interactions.PLIPIntraMolecularAnnotator) Path(java.nio.file.Path) RawFeatureVector(de.bioforscher.start2fold.model.vector.RawFeatureVector) Proline(de.bioforscher.jstructure.model.structure.aminoacid.Proline) Files(java.nio.file.Files) AccessibleSurfaceAreaCalculator(de.bioforscher.jstructure.feature.asa.AccessibleSurfaceAreaCalculator) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) IdentifierFactory(de.bioforscher.jstructure.model.identifier.IdentifierFactory) Paths(java.nio.file.Paths) Document(org.jsoup.nodes.Document) weka.core(weka.core) SmoothedFeatureVector(de.bioforscher.start2fold.model.vector.SmoothedFeatureVector) InputStream(java.io.InputStream) Chain(de.bioforscher.jstructure.model.structure.Chain) ChainIdentifier(de.bioforscher.jstructure.model.identifier.ChainIdentifier) AminoAcid(de.bioforscher.jstructure.model.structure.aminoacid.AminoAcid) 
SmoothedFeatureVector(de.bioforscher.start2fold.model.vector.SmoothedFeatureVector) Proline(de.bioforscher.jstructure.model.structure.aminoacid.Proline) GenericSecondaryStructure(de.bioforscher.jstructure.feature.sse.GenericSecondaryStructure) Document(org.jsoup.nodes.Document) IOException(java.io.IOException) LoopFraction(de.bioforscher.jstructure.feature.loopfraction.LoopFraction) RawFeatureVector(de.bioforscher.start2fold.model.vector.RawFeatureVector)

Example 2 with RawFeatureVector

use of de.bioforscher.start2fold.model.vector.RawFeatureVector in project jstructure by JonStargaryen.

the class A01_WriteEarlyFoldingClassificationArff method handleLine.

/**
 * Turns one semicolon-separated input line into the data rows for a single protein.
 *
 * <p>The line is read as {@code entryId;pdbId;[id,id,...]}: the first field names the
 * Start2Fold XML file, the second the PDB entry to parse and the third a bracketed,
 * comma-separated list of experiment ids. Only the first chain of the structure is used.
 * Each non-proline residue yields one comma-separated row of formatted feature values
 * ending in {@code early} or {@code late}.
 *
 * @param line the input line to process
 * @return the rows joined by the system line separator, or an empty {@code Optional} if
 *         anything fails while processing the line (the failure is logged as a warning)
 */
private static Optional<String> handleLine(String line) {
    try {
        logger.info("handling {}", line);
        String[] columns = line.split(";");
        String entryId = columns[0];
        String pdbId = columns[1];
        // strip the surrounding brackets before splitting the id list on commas
        String rawExperimentIds = columns[2].replaceAll("\\[", "").replaceAll("]", "");
        List<Integer> experimentIds = Pattern.compile(",")
                .splitAsStream(rawExperimentIds)
                .map(Integer::valueOf)
                .collect(Collectors.toList());

        Chain chain = StructureParser.fromPdbId(pdbId).parse().getFirstChain();
        Start2FoldXmlParser.parseSpecificExperiment(chain,
                Start2FoldConstants.XML_DIRECTORY.resolve(entryId + ".xml"),
                experimentIds);

        List<AminoAcid> earlyFoldingResidues = chain.aminoAcids()
                .filter(residue -> residue.getFeature(Start2FoldResidueAnnotation.class).isEarly())
                .collect(Collectors.toList());
        List<AminoAcid> aminoAcids = chain.aminoAcids().collect(Collectors.toList());

        for (AminoAcid residue : aminoAcids) {
            RawFeatureVector.assignRawFeatureVector(residue);
        }
        // smooth features
        for (AminoAcid residue : aminoAcids) {
            SmoothedFeatureVector.assignSmoothedFeatureVector(aminoAcids, residue);
        }

        String rows = aminoAcids.stream()
                .filter(residue -> !(residue instanceof Proline))
                .map(residue -> {
                    SmoothedFeatureVector features = residue.getFeature(SmoothedFeatureVector.class);
                    return String.join(",",
                            StandardFormat.format(features.getEnergy()),
                            StandardFormat.format(features.getEgor()),
                            StandardFormat.format(features.getSecondaryStructureElementSize()),
                            // already smoothed
                            StandardFormat.format(residue.getFeature(LoopFraction.class).getLoopFraction()),
                            StandardFormat.format(features.getRasa()),
                            StandardFormat.format(features.getLocalInteractions()),
                            StandardFormat.format(features.getLocalHydrogen()),
                            StandardFormat.format(features.getLocalHydrophobic()),
                            StandardFormat.format(features.getLocalBackbone()),
                            StandardFormat.format(features.getNonLocalInteractions()),
                            StandardFormat.format(features.getNonLocalHydrogen()),
                            StandardFormat.format(features.getNonLocalHydrophobic()),
                            StandardFormat.format(features.getNonLocalBackbone()),
                            StandardFormat.format(features.getBetweenness()),
                            StandardFormat.format(features.getCloseness()),
                            StandardFormat.format(features.getClusteringCoefficient()),
                            StandardFormat.format(features.getHydrogenBetweenness()),
                            StandardFormat.format(features.getHydrogenCloseness()),
                            StandardFormat.format(features.getHydrogenClusteringCoefficient()),
                            StandardFormat.format(features.getHydrophobicBetweenness()),
                            StandardFormat.format(features.getHydrophobicCloseness()),
                            StandardFormat.format(features.getHydrophobicClusteringCoefficient()),
                            StandardFormat.format(features.getConvBetweenness()),
                            StandardFormat.format(features.getConvCloseness()),
                            StandardFormat.format(features.getConvClusteringCoefficient()),
                            StandardFormat.format(features.getDistinctNeighborhoods()),
                            StandardFormat.format(features.getConvDistinctNeighborhoods()),
                            earlyFoldingResidues.contains(residue) ? "early" : "late");
                })
                .collect(Collectors.joining(System.lineSeparator()));
        return Optional.of(rows);
    } catch (Exception e) {
        logger.warn("computation for {} failed", line, e);
        return Optional.empty();
    }
}
Also used : RawFeatureVector(de.bioforscher.start2fold.model.vector.RawFeatureVector) Proline(de.bioforscher.jstructure.model.structure.aminoacid.Proline) Logger(org.slf4j.Logger) LoopFraction(de.bioforscher.jstructure.feature.loopfraction.LoopFraction) Files(java.nio.file.Files) LoggerFactory(org.slf4j.LoggerFactory) Structure(de.bioforscher.jstructure.model.structure.Structure) IOException(java.io.IOException) StructureParser(de.bioforscher.jstructure.model.structure.StructureParser) Collectors(java.util.stream.Collectors) Start2FoldResidueAnnotation(de.bioforscher.start2fold.model.Start2FoldResidueAnnotation) List(java.util.List) Start2FoldConstants(de.bioforscher.start2fold.Start2FoldConstants) AminoAcid(de.bioforscher.jstructure.model.structure.aminoacid.AminoAcid) SmoothedFeatureVector(de.bioforscher.start2fold.model.vector.SmoothedFeatureVector) Start2FoldXmlParser(de.bioforscher.start2fold.parser.Start2FoldXmlParser) Chain(de.bioforscher.jstructure.model.structure.Chain) Optional(java.util.Optional) StandardFormat(de.bioforscher.jstructure.StandardFormat) Pattern(java.util.regex.Pattern) Chain(de.bioforscher.jstructure.model.structure.Chain) AminoAcid(de.bioforscher.jstructure.model.structure.aminoacid.AminoAcid) SmoothedFeatureVector(de.bioforscher.start2fold.model.vector.SmoothedFeatureVector) Start2FoldResidueAnnotation(de.bioforscher.start2fold.model.Start2FoldResidueAnnotation) Proline(de.bioforscher.jstructure.model.structure.aminoacid.Proline) IOException(java.io.IOException) LoopFraction(de.bioforscher.jstructure.feature.loopfraction.LoopFraction) Structure(de.bioforscher.jstructure.model.structure.Structure) RawFeatureVector(de.bioforscher.start2fold.model.vector.RawFeatureVector)

Example 3 with RawFeatureVector

use of de.bioforscher.start2fold.model.vector.RawFeatureVector in project jstructure by JonStargaryen.

the class A03_WriteFunctionalClassificationArff method handleLine.

/**
 * Turns one semicolon-separated input line into the data rows for a single protein,
 * labelling each residue as {@code functional} or {@code non-functional}.
 *
 * <p>The second field of the line is the PDB entry to parse; the functional residue
 * numbers are extracted from the split line by {@code Start2FoldConstants}. Only the
 * first chain of the structure is used, and prolines are left out of the result.
 *
 * @param line the input line to process
 * @return the rows joined by the system line separator; an empty {@code Optional} if the
 *         protein has no functional residues annotated or if anything fails while
 *         processing the line (failures are logged as a warning)
 */
private static Optional<String> handleLine(String line) {
    try {
        logger.info("handling {}", line);
        String[] columns = line.split(";");
        String pdbId = columns[1];
        Chain chain = StructureParser.fromPdbId(pdbId).parse().getFirstChain();
        List<AminoAcid> aminoAcids = chain.aminoAcids().collect(Collectors.toList());
        for (AminoAcid residue : aminoAcids) {
            RawFeatureVector.assignRawFeatureVector(residue);
        }

        List<Integer> functionalResidueNumbers = Start2FoldConstants.extractFunctioanlResidueNumbers(columns);
        // without any annotated residue numbers there is nothing to parse or report
        if (functionalResidueNumbers.isEmpty()) {
            return Optional.empty();
        }
        FunctionalResidueParser.parse(chain, functionalResidueNumbers);
        List<AminoAcid> functionalResidues = chain.aminoAcids()
                .filter(residue -> residue.getFeature(FunctionalResidueAnnotation.class).isFunctional())
                .collect(Collectors.toList());
        // ignore proteins lacking annotation of functional residues
        if (functionalResidues.isEmpty()) {
            return Optional.empty();
        }

        // smooth features
        for (AminoAcid residue : aminoAcids) {
            SmoothedFeatureVector.assignSmoothedFeatureVector(aminoAcids, residue);
        }

        String rows = aminoAcids.stream()
                .filter(residue -> !(residue instanceof Proline))
                .map(residue -> {
                    SmoothedFeatureVector features = residue.getFeature(SmoothedFeatureVector.class);
                    return String.join(",",
                            StandardFormat.format(features.getEnergy()),
                            StandardFormat.format(features.getEgor()),
                            StandardFormat.format(features.getSecondaryStructureElementSize()),
                            // already smoothed
                            StandardFormat.format(residue.getFeature(LoopFraction.class).getLoopFraction()),
                            StandardFormat.format(features.getRasa()),
                            StandardFormat.format(features.getLocalInteractions()),
                            StandardFormat.format(features.getLocalHydrogen()),
                            StandardFormat.format(features.getLocalHydrophobic()),
                            StandardFormat.format(features.getLocalBackbone()),
                            StandardFormat.format(features.getNonLocalInteractions()),
                            StandardFormat.format(features.getNonLocalHydrogen()),
                            StandardFormat.format(features.getNonLocalHydrophobic()),
                            StandardFormat.format(features.getNonLocalBackbone()),
                            StandardFormat.format(features.getBetweenness()),
                            StandardFormat.format(features.getCloseness()),
                            StandardFormat.format(features.getClusteringCoefficient()),
                            StandardFormat.format(features.getHydrogenBetweenness()),
                            StandardFormat.format(features.getHydrogenCloseness()),
                            StandardFormat.format(features.getHydrogenClusteringCoefficient()),
                            StandardFormat.format(features.getHydrophobicBetweenness()),
                            StandardFormat.format(features.getHydrophobicCloseness()),
                            StandardFormat.format(features.getHydrophobicClusteringCoefficient()),
                            StandardFormat.format(features.getConvBetweenness()),
                            StandardFormat.format(features.getConvCloseness()),
                            StandardFormat.format(features.getConvClusteringCoefficient()),
                            StandardFormat.format(features.getDistinctNeighborhoods()),
                            StandardFormat.format(features.getConvDistinctNeighborhoods()),
                            functionalResidues.contains(residue) ? "functional" : "non-functional");
                })
                .collect(Collectors.joining(System.lineSeparator()));
        return Optional.of(rows);
    } catch (Exception e) {
        logger.warn("computation for {} failed", line, e);
        return Optional.empty();
    }
}
Also used : RawFeatureVector(de.bioforscher.start2fold.model.vector.RawFeatureVector) Proline(de.bioforscher.jstructure.model.structure.aminoacid.Proline) Logger(org.slf4j.Logger) LoopFraction(de.bioforscher.jstructure.feature.loopfraction.LoopFraction) Files(java.nio.file.Files) LoggerFactory(org.slf4j.LoggerFactory) Structure(de.bioforscher.jstructure.model.structure.Structure) IOException(java.io.IOException) StructureParser(de.bioforscher.jstructure.model.structure.StructureParser) Collectors(java.util.stream.Collectors) FunctionalResidueParser(de.bioforscher.start2fold.parser.FunctionalResidueParser) ArrayList(java.util.ArrayList) FunctionalResidueAnnotation(de.bioforscher.start2fold.model.FunctionalResidueAnnotation) List(java.util.List) Start2FoldConstants(de.bioforscher.start2fold.Start2FoldConstants) AminoAcid(de.bioforscher.jstructure.model.structure.aminoacid.AminoAcid) SmoothedFeatureVector(de.bioforscher.start2fold.model.vector.SmoothedFeatureVector) Chain(de.bioforscher.jstructure.model.structure.Chain) Optional(java.util.Optional) StandardFormat(de.bioforscher.jstructure.StandardFormat) Chain(de.bioforscher.jstructure.model.structure.Chain) AminoAcid(de.bioforscher.jstructure.model.structure.aminoacid.AminoAcid) SmoothedFeatureVector(de.bioforscher.start2fold.model.vector.SmoothedFeatureVector) ArrayList(java.util.ArrayList) Proline(de.bioforscher.jstructure.model.structure.aminoacid.Proline) FunctionalResidueAnnotation(de.bioforscher.start2fold.model.FunctionalResidueAnnotation) IOException(java.io.IOException) LoopFraction(de.bioforscher.jstructure.feature.loopfraction.LoopFraction) Structure(de.bioforscher.jstructure.model.structure.Structure) RawFeatureVector(de.bioforscher.start2fold.model.vector.RawFeatureVector)

Aggregations

StandardFormat (de.bioforscher.jstructure.StandardFormat)3 LoopFraction (de.bioforscher.jstructure.feature.loopfraction.LoopFraction)3 Chain (de.bioforscher.jstructure.model.structure.Chain)3 Structure (de.bioforscher.jstructure.model.structure.Structure)3 StructureParser (de.bioforscher.jstructure.model.structure.StructureParser)3 AminoAcid (de.bioforscher.jstructure.model.structure.aminoacid.AminoAcid)3 Proline (de.bioforscher.jstructure.model.structure.aminoacid.Proline)3 RawFeatureVector (de.bioforscher.start2fold.model.vector.RawFeatureVector)3 SmoothedFeatureVector (de.bioforscher.start2fold.model.vector.SmoothedFeatureVector)3 IOException (java.io.IOException)3 Files (java.nio.file.Files)3 Collectors (java.util.stream.Collectors)3 Start2FoldConstants (de.bioforscher.start2fold.Start2FoldConstants)2 List (java.util.List)2 Optional (java.util.Optional)2 Logger (org.slf4j.Logger)2 LoggerFactory (org.slf4j.LoggerFactory)2 AccessibleSurfaceAreaCalculator (de.bioforscher.jstructure.feature.asa.AccessibleSurfaceAreaCalculator)1 EgorAgreementCalculator (de.bioforscher.jstructure.feature.energyprofile.EgorAgreementCalculator)1 PLIPIntraMolecularAnnotator (de.bioforscher.jstructure.feature.interactions.PLIPIntraMolecularAnnotator)1