Use of de.unijena.bioinf.chemdb.ChemicalDatabase in project sirius by boecker-lab.
The method main of the class FingerprintVisualization.
/**
 * Writes example substructures and matched-atom counts for fingerprint features to
 * {@code feature_examples.tsv}.
 *
 * <p>Inputs, both read from the working directory as UTF-8, tab-separated text:
 * <ul>
 *   <li>{@code summary.csv} - the first column is parsed as a fingerprint index; the remaining
 *       columns are candidate substructure SMILES (see the summary loop below).</li>
 *   <li>{@code compounds.csv} - columns 1, 2 and 3 (zero-based) are read as InChIKey, InChI and
 *       SMILES respectively.</li>
 * </ul>
 *
 * <p>CDK, clone and database exceptions abort the processing and are printed to stderr; whatever
 * was written to the output file up to that point is kept.
 *
 * @param args ignored
 * @throws IOException if an input file cannot be read or the output file cannot be opened
 */
public static void main(String[] args) throws IOException {
    final CdkFingerprintVersion version = CdkFingerprintVersion.getComplete();
    final Charset utf8 = Charset.forName("UTF-8");
    // Read both inputs before opening the output so that a missing input file does not
    // truncate an existing feature_examples.tsv.
    final List<String> lines = Files.readAllLines(new File("summary.csv").toPath(), utf8);
    final List<String> compounds = Files.readAllLines(new File("compounds.csv").toPath(), utf8);

    // atomSizes[index] is the number of atoms of an example match for the feature, -1 = unknown.
    final int[] atomSizes = new int[version.size()];
    Arrays.fill(atomSizes, -1);

    // Both resources are closed automatically, also when an exception escapes this method.
    try (final PrintStream stream = new PrintStream("feature_examples.tsv");
         final ChemicalDatabase db = new ChemicalDatabase()) {
        final SMARTSQueryTool query = new SMARTSQueryTool("C-C", SilentChemObjectBuilder.getInstance());
        query.setQueryCacheSize(atomSizes.length);
        final ECFPFingerprinter fingerprinter = new ECFPFingerprinter();
        final int ecfpOffset = version.getOffsetFor(CdkFingerprintVersion.USED_FINGERPRINTS.ECFP);
        final String[] substructures = new String[fingerprinter.getSize()];
        final String[] smarts = new String[fingerprinter.getSize()];
        // One parser instance is reused for every SMILES string parsed below.
        final SmilesParser smilesParser = new SmilesParser(SilentChemObjectBuilder.getInstance());

        for (String aline : compounds) {
            final String[] linetabs = aline.split("\t");
            if (linetabs.length < 4) {
                // Blank or truncated line: skip it instead of failing with an index error.
                System.err.println("Warning: skipping malformed compound line: '" + aline + "'");
                continue;
            }
            final String inchikey = linetabs[1];
            final String inchi = linetabs[2];
            final String smiles = linetabs[3];
            final IAtomContainer molecule = smilesParser.parseSmiles(smiles);
            final int[][] matrix = AdjacencyMatrix.getMatrix(molecule);
            final IBitFingerprint fp = fingerprinter.getBitFingerprint(molecule);
            final CircularFingerprinter.FP[] details = fingerprinter.getFingerprintDetails();

            // Collect an example substructure for every ECFP bit set in this molecule.
            for (int i = 0; i < fp.size(); ++i) {
                if (!fp.get(i)) {
                    continue;
                }
                boolean aroma = false;
                final CircularFingerprinter.FP x = details[i];
                final TIntArrayList inds = new TIntArrayList();
                final BitSet ids = new BitSet(molecule.getAtomCount());
                for (int atomi : x.atoms) {
                    if (ids.get(atomi)) {
                        continue;
                    }
                    final IAtom atom = molecule.getAtom(atomi);
                    aroma = aroma || atom.isAromatic();
                    inds.add(atomi);
                    ids.set(atomi);
                    addAromaticRings(molecule, matrix, atomi, atom, ids, inds);
                }
                // Keep the first example found, but let any later example that contains no
                // aromatic atom replace it.
                if (substructures[i] == null || !aroma) {
                    substructures[i] = SmilesGenerator.absolute().aromatic().create(
                            AtomContainerManipulator.extractSubstructure(molecule, inds.toArray()));
                    atomSizes[i + ecfpOffset] = x.atoms.length;
                    // The second string is generated after addNeighbours has been applied to
                    // the atom selection.
                    addNeighbours(molecule, matrix, ids, inds);
                    smarts[i] = SmilesGenerator.absolute().aromatic().create(
                            AtomContainerManipulator.extractSubstructure(molecule, inds.toArray()));
                }
            }

            Fingerprint fingerprint = db.lookupFingerprintByInChI(InChIs.newInChI(inchikey, inchi));
            if (fingerprint == null) {
                // Not in the database: compute it from the SMILES.
                // NOTE(review): this uses the default fingerprint version while atomSizes is
                // sized for the complete version - confirm that f.getIndex() is compatible.
                fingerprint = new FixedFingerprinter(CdkFingerprintVersion.getDefault())
                        .computeFingerprintFromSMILES(smiles);
            }
            if (fingerprint != null) {
                for (FPIter f : fingerprint) {
                    // Only fill in features whose size is still unknown.
                    if (f.isSet() && atomSizes[f.getIndex()] < 0) {
                        if (f.getMolecularProperty() instanceof SubstructureProperty) {
                            query.setSmarts(((SubstructureProperty) f.getMolecularProperty()).getSmarts());
                            if (query.matches(molecule)) {
                                final List<List<Integer>> size = query.getUniqueMatchingAtoms();
                                if (!size.isEmpty()) {
                                    atomSizes[f.getIndex()] = Math.max(atomSizes[f.getIndex()], size.get(0).size());
                                }
                            }
                        }
                    }
                }
            }
        }

        final SMARTSQueryTool queryTool = new SMARTSQueryTool("C#C", SilentChemObjectBuilder.getInstance());
        queryTool.setQueryCacheSize(6000);
        for (String line : lines) {
            if (line.isEmpty()) {
                // An empty line has no index column to parse.
                continue;
            }
            final String[] tabs = line.split("\t");
            final int index = Integer.parseInt(tabs[0]);
            final MolecularProperty prop = version.getMolecularProperty(index);
            stream.print(tabs[0]);
            stream.print("\t");
            stream.print(atomSizes[index]);
            if (tabs.length == 2) {
                stream.print("\t" + tabs[1]);
            } else {
                // Every second column starting at index 2 is treated as a SMILES string and is
                // only written out if the feature's SMARTS pattern matches it.
                for (int i = 2; i < tabs.length; i += 2) {
                    try {
                        final IAtomContainer substructure = smilesParser.parseSmiles(tabs[i]);
                        if (prop instanceof SubstructureProperty) {
                            queryTool.setSmarts(((SubstructureProperty) prop).getSmarts());
                            if (queryTool.matches(substructure)) {
                                final List<List<Integer>> match = queryTool.getUniqueMatchingAtoms();
                                if (match.size() > 0 && match.get(0).size() > 0) {
                                    stream.print("\t" + tabs[i]);
                                }
                            }
                        }
                    } catch (Exception e) {
                        // Best effort: a broken example must not abort the whole export.
                        e.printStackTrace();
                    }
                }
            }
            stream.print("\n");
        }

        // now add ECFP fingerprints
        for (int i = 0; i < substructures.length; ++i) {
            if (substructures[i] == null) {
                System.err.println("Warning: No smiles for ECFP with index " + i + " and hash " + version.getMolecularProperty(ecfpOffset + i).getDescription());
            } else {
                stream.print(ecfpOffset + i);
                stream.print("\t");
                stream.print(atomSizes[i + ecfpOffset]);
                stream.print("\t");
                stream.print(substructures[i]);
                stream.print("\t");
                stream.println(smarts[i]);
            }
        }
    } catch (CDKException e) {
        e.printStackTrace();
    } catch (CloneNotSupportedException e) {
        e.printStackTrace();
    } catch (ChemicalDatabaseException e) {
        e.printStackTrace();
    }
}
Aggregations