use of org.rcsb.mmtf.api.StructureDataInterface in project mmtf-spark by sbl-sdsc.
the class MetalInteractions method main.
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfFullPath();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(MetalInteractions.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // input parameters
    int sequenceIdentityCutoff = 30;
    double resolution = 2.5;
    int minInteractions = 4;
    int maxInteractions = 6;
    double distanceCutoff = 3.0;

    // chemical component codes of metals in different oxidation states
    String[] metals = { "V", "CR", "MN", "MN3", "FE", "FE2", "CO", "3CO", "NI", "3NI", "CU", "CU1", "CU3", "ZN", "MO", "4MO", "6MO" };

    // read PDB and create a non-redundant PISCES subset
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc)
            .filter(new Pisces(sequenceIdentityCutoff, resolution));

    // set up criteria for metal interactions
    InteractionFilter filter = new InteractionFilter();
    filter.setDistanceCutoff(distanceCutoff);
    filter.setMinInteractions(minInteractions);
    filter.setMaxInteractions(maxInteractions);
    filter.setQueryGroups(true, metals);

    // exclude non-polar interactions
    filter.setTargetElements(false, "H", "C", "P");

    // tabulate interactions in a dataframe
    Dataset<Row> interactions = GroupInteractionExtractor.getInteractions(pdb, filter).cache();
    System.out.println("Metal interactions: " + interactions.count());

    // select interacting atoms and orientational order parameters (q4 - q6),
    // see {@link CoordinationGeometry}
    interactions = interactions.select("pdbId", "q4", "q5", "q6",
            "element0", "groupNum0", "chain0",
            "element1", "groupNum1", "chain1", "distance1",
            "element2", "groupNum2", "chain2", "distance2",
            "element3", "groupNum3", "chain3", "distance3",
            "element4", "groupNum4", "chain4", "distance4",
            "element5", "groupNum5", "chain5", "distance5",
            "element6", "groupNum6", "chain6", "distance6").cache();

    // show some example interactions
    interactions.dropDuplicates("pdbId").show(10);

    System.out.println("Unique interactions by metal:");
    interactions.groupBy("element0").count().sort("count").show();

    sc.close();
}
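If the tabulated interactions are needed for downstream analysis, the cached dataframe can be persisted with Spark's standard dataframe writers before the context is closed. A minimal sketch (the output path is a placeholder, not part of the original example):

// Hypothetical follow-up: persist the tabulated interactions (run before sc.close()).
// The output path is a placeholder.
interactions.write().mode("overwrite").parquet("/tmp/metal_interactions.parquet");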
use of org.rcsb.mmtf.api.StructureDataInterface in project mmtf-spark by sbl-sdsc.
the class MmtfImporter method getFromPdbString.
/**
* Reads a PDB-formatted String.
*
* @param pdbString a PDB-formatted String
* @param structureId the structure identifier
* @return structure data
* @throws IOException
*/
public static StructureDataInterface getFromPdbString(String pdbString, String structureId) throws IOException {
    InputStream is = new ByteArrayInputStream(pdbString.getBytes());
    StructureDataInterface structure = toStructureDataInterface(is, structureId);
    is.close();
    return structure;
}
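A possible usage sketch, assuming a minimal single-atom PDB record held in memory (the record and the "TEST" identifier below are made up for illustration):

// Hypothetical usage: convert an in-memory PDB record into structure data.
String pdbString = "ATOM      1  N   ALA A   1      11.104   6.134  -6.504  1.00  0.00           N";
StructureDataInterface structure = MmtfImporter.getFromPdbString(pdbString, "TEST");
System.out.println("models: " + structure.getNumModels());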
use of org.rcsb.mmtf.api.StructureDataInterface in project mmtf-spark by sbl-sdsc.
the class MmtfImporter method importMmcifFiles.
/**
 * Reads uncompressed and compressed mmCIF files recursively from a given
 * directory path. This method reads files with the .cif or .cif.gz
 * extension.
 *
 * @param path path to the .cif files
 * @param sc Spark context
 * @return structure data as key/value pairs
 */
public static JavaPairRDD<String, StructureDataInterface> importMmcifFiles(String path, JavaSparkContext sc) {
    FileParsingParameters params = new FileParsingParameters();
    params.setCreateAtomBonds(true);

    return sc.parallelize(getFiles(path)).mapToPair(new PairFunction<File, String, StructureDataInterface>() {
        private static final long serialVersionUID = -7815663658405168429L;

        public Tuple2<String, StructureDataInterface> call(File f) throws Exception {
            String fileName = f.getName();
            if (!fileName.endsWith(".cif") && !fileName.endsWith(".cif.gz")) {
                return null;
            }
            try {
                InputStream is = new FileInputStream(f);
                if (fileName.endsWith(".cif.gz")) {
                    is = new GZIPInputStream(is);
                }

                // parse .cif file
                MMCIFFileReader mmcifReader = new MMCIFFileReader();
                mmcifReader.setFileParsingParameters(params);
                Structure struc = mmcifReader.getStructure(is);
                is.close();

                // convert to MMTF
                AdapterToStructureData writerToEncoder = new AdapterToStructureData();
                new MmtfStructureWriter(struc, writerToEncoder);

                return new Tuple2<String, StructureDataInterface>(fileName.substring(0, fileName.indexOf(".cif")), writerToEncoder);
            } catch (Exception e) {
                System.out.println("WARNING: cannot parse: " + fileName + ". Skipping this entry!");
                return null;
            }
        }
    }).filter(t -> t != null);
}
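A possible usage sketch (the directory path is a placeholder):

// Hypothetical usage: import all .cif/.cif.gz files under a local directory.
SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("MmcifImport");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaPairRDD<String, StructureDataInterface> structures = MmtfImporter.importMmcifFiles("/path/to/cif-files", sc);
System.out.println("structures imported: " + structures.count());
sc.close();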
use of org.rcsb.mmtf.api.StructureDataInterface in project mmtf-spark by sbl-sdsc.
the class PdbSequenceToWord2Vec method main.
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 1) {
        System.err.println("Usage: " + PdbSequenceToWord2Vec.class.getSimpleName() + " <outputFileName>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(PdbSequenceToWord2Vec.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant PISCES
    // subset (<= 40% sequence identity) of L-protein chains
    int sequenceIdentity = 40;
    double resolution = 3.0;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));

    Dataset<Row> data = PolymerSequenceExtractor.getDataset(pdb);
    data.show(10, false);

    // length of polymer sequence segment (number of residues)
    int segmentLength = 11;

    // add Word2Vec encoded feature vector
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);

    // size of n-grams
    int n = 2;
    int windowSize = (segmentLength - 1) / 2;

    // dimension of the feature vector
    int vectorSize = 50;

    data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize);
    encoder.getWord2VecModel().save(args[0]);

    long end = System.nanoTime();
    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");

    sc.close();
}
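Since the trained model is saved with Spark ML's standard persistence, it can presumably be reloaded later via Word2VecModel.load. A minimal sketch (the path below is a placeholder for the location the model was saved to via args[0]):

// Hypothetical follow-up: reload the saved model and inspect the learned n-gram vectors.
Word2VecModel model = Word2VecModel.load("/path/to/saved/model");  // placeholder path
model.getVectors().show(10, false);  // DataFrame of word -> vector mappings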
use of org.rcsb.mmtf.api.StructureDataInterface in project mmtf-spark by sbl-sdsc.
the class SecondaryStructureBlosum62Encoder method main.
/**
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet)
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 2) {
        System.err.println("Usage: " + SecondaryStructureBlosum62Encoder.class.getSimpleName() + " <outputFilePath> <fileFormat>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(SecondaryStructureBlosum62Encoder.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant PISCES
    // subset (<= 20% sequence identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));

    int segmentLength = 11;
    Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength).cache();
    System.out.println("original data      : " + data.count());

    data = data.dropDuplicates("labelQ3", "sequence").cache();
    System.out.println("- duplicate Q3/seq : " + data.count());

    data = data.dropDuplicates("sequence").cache();
    System.out.println("- duplicate seq    : " + data.count());

    // add a property-encoded feature vector
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    data = encoder.blosum62Encode();

    data.printSchema();
    data.show(25, false);

    if (args[1].equals("json")) {
        // coalesce data into a single file
        data = data.coalesce(1);
    }
    data.write().mode("overwrite").format(args[1]).save(args[0]);

    long end = System.nanoTime();
    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");

    sc.close();
}
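The written file can then be read back for model training with a plain Spark session. A minimal sketch (path and format are placeholders; only the labelQ3 and sequence columns, which the code above demonstrably produces, are selected):

// Hypothetical follow-up: read the encoded dataset back, e.g. for model training.
SparkSession spark = SparkSession.builder().master("local[*]").appName("ReadEncodedData").getOrCreate();
Dataset<Row> encoded = spark.read().format("parquet").load("/path/to/output");  // placeholder path and format
encoded.printSchema();
encoded.select("labelQ3", "sequence").show(5, false);
spark.stop();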