use of org.rcsb.mmtf.api.StructureDataInterface in project mmtf-spark by sbl-sdsc.
the class StructureToBioassembly2 method call.
/**
 * Expands one structure into a separate structure per biological assembly.
 *
 * <p>For every bioassembly of the input structure a new
 * {@link AdapterToStructureData} is built. Each transformation of the
 * bioassembly contributes a transformed copy of every chain it lists; the
 * copy's chain id and chain name get the 1-based transformation number
 * appended. Only intra-group bonds are copied; inter-group bonds are not
 * transferred yet.
 *
 * <p>Chain indices returned by {@code getChainIndexListForTransform} are
 * compared against the chain's position <em>within its model</em>, both when
 * sizing the new structure and when filling it.
 *
 * @param t tuple of (structure id, structure); only the structure is read
 * @return iterator over (structureId + "-BioAssembly" + assembly name, assembly structure)
 *         tuples, one per bioassembly of the input
 * @throws Exception declared by the Spark function interface
 */
@Override
public Iterator<Tuple2<String, StructureDataInterface>> call(Tuple2<String, StructureDataInterface> t) throws Exception {
    StructureDataInterface structure = t._2;
    List<Tuple2<String, StructureDataInterface>> resList = new ArrayList<>();

    // Loop-invariant views of the source structure, fetched once.
    int numModels = structure.getNumModels();
    int[] sourceChainsPerModel = structure.getChainsPerModel();
    int[] groupsPerChain = structure.getGroupsPerChain();
    int[] groupTypeIndices = structure.getGroupTypeIndices();
    int[] groupIds = structure.getGroupIds();
    char[] insCodes = structure.getInsCodes();
    int[] sequenceIndices = structure.getGroupSequenceIndices();
    int[] secStructList = structure.getSecStructList();
    String[] chainIds = structure.getChainIds();
    String[] chainNames = structure.getChainNames();
    int[] atomIds = structure.getAtomIds();
    char[] altLocIds = structure.getAltLocIds();
    float[] xCoords = structure.getxCoords();
    float[] yCoords = structure.getyCoords();
    float[] zCoords = structure.getzCoords();
    float[] occupancies = structure.getOccupancies();
    float[] bFactors = structure.getbFactors();
    int[] chainToEntityIndex = getChainToEntityIndex(structure);

    // Global index of the first chain / first group of every model, so the
    // sizing pass below reads the same chains and groups as the build pass.
    int[] firstChainOfModel = new int[numModels];
    int[] firstGroupOfModel = new int[numModels];
    for (int m = 0, chain = 0, group = 0; m < numModels; m++) {
        firstChainOfModel[m] = chain;
        firstGroupOfModel[m] = group;
        for (int c = 0; c < sourceChainsPerModel[m]; c++, chain++) {
            group += groupsPerChain[chain];
        }
    }

    // Create one structure per bioassembly.
    for (int i = 0; i < structure.getNumBioassemblies(); i++) {
        AdapterToStructureData bioAssembly = new AdapterToStructureData();
        String structureId = structure.getStructureId() + "-BioAssembly" + structure.getBioassemblyName(i);

        int numTrans = structure.getNumTransInBioassembly(i);
        // bioChainList[k] holds the chain indices transformation k applies to,
        // transforms[k] is the matrix applied to those chains.
        int[][] bioChainList = new int[numTrans][];
        Matrix4d[] transforms = new Matrix4d[numTrans];

        // Pass 1: compute the totals needed to initialize the new structure.
        int[] assemblyChainsPerModel = new int[numModels];
        int totAtoms = 0;
        int totBonds = 0;
        int totGroups = 0;
        int totChains = 0;
        for (int ii = 0; ii < numTrans; ii++) {
            bioChainList[ii] = structure.getChainIndexListForTransform(i, ii);
            transforms[ii] = new Matrix4d(structure.getMatrixForTransform(i, ii));
            for (int j = 0; j < numModels; j++) {
                totChains += bioChainList[ii].length;
                assemblyChainsPerModel[j] += bioChainList[ii].length;
                int groupCounter = firstGroupOfModel[j];
                for (int k = 0; k < sourceChainsPerModel[j]; k++) {
                    int numGroups = groupsPerChain[firstChainOfModel[j] + k];
                    if (!containsChainIndex(bioChainList[ii], k)) {
                        // chain is not part of this transformation: skip its groups
                        groupCounter += numGroups;
                        continue;
                    }
                    totGroups += numGroups;
                    for (int h = 0; h < numGroups; h++, groupCounter++) {
                        int groupType = groupTypeIndices[groupCounter];
                        totAtoms += structure.getNumAtomsInGroup(groupType);
                        totBonds += structure.getGroupBondOrders(groupType).length;
                    }
                }
            }
        }

        bioAssembly.initStructure(totBonds, totAtoms, totGroups, totChains, numModels, structureId);
        DecoderUtils.addXtalographicInfo(structure, bioAssembly);
        DecoderUtils.addHeaderInfo(structure, bioAssembly);

        // Pass 2: copy the selected chains, once per transformation.
        int chainIndex = 0;   // global chain index in the source structure
        int groupIndex = 0;   // global index of the current chain's first group
        int atomIndex = 0;    // global index of the current chain's first atom
        int chainCounter = 0; // number of chains written to the assembly so far
        for (int modelIndex = 0; modelIndex < numModels; modelIndex++) {
            // The assembly may contain fewer (or more) chains than the source
            // model, so the count from pass 1 is used here.
            bioAssembly.setModelInfo(modelIndex, assemblyChainsPerModel[modelIndex]);

            for (int j = 0; j < sourceChainsPerModel[modelIndex]; j++, chainIndex++) {
                int numGroups = groupsPerChain[chainIndex];

                for (int k = 0; k < numTrans; k++) {
                    if (!containsChainIndex(bioChainList[k], j)) {
                        continue;
                    }
                    Matrix4d transform = transforms[k];
                    int entityIndex = chainToEntityIndex[chainIndex];
                    // TODO not sure: every copied chain is registered as its own entity
                    bioAssembly.setEntityInfo(new int[] { chainCounter },
                            structure.getEntitySequence(entityIndex),
                            structure.getEntityDescription(entityIndex),
                            structure.getEntityType(entityIndex));
                    // TODO create unique chain ids
                    bioAssembly.setChainInfo(chainIds[chainIndex] + (k + 1), chainNames[chainIndex] + (k + 1), numGroups);
                    chainCounter++;

                    // Every copy restarts at the first atom of the source chain.
                    int atomCursor = atomIndex;
                    for (int g = groupIndex; g < groupIndex + numGroups; g++) {
                        int groupType = groupTypeIndices[g];
                        int numAtoms = structure.getNumAtomsInGroup(groupType);
                        int[] bondOrders = structure.getGroupBondOrders(groupType);
                        bioAssembly.setGroupInfo(structure.getGroupName(groupType), groupIds[g], insCodes[g],
                                structure.getGroupChemCompType(groupType), numAtoms, bondOrders.length,
                                structure.getGroupSingleLetterCode(groupType), sequenceIndices[g], secStructList[g]);

                        String[] atomNames = structure.getGroupAtomNames(groupType);
                        String[] elementNames = structure.getGroupElementNames(groupType);
                        int[] atomCharges = structure.getGroupAtomCharges(groupType);
                        for (int a = 0; a < numAtoms; a++, atomCursor++) {
                            Point3f p = new Point3f(xCoords[atomCursor], yCoords[atomCursor], zCoords[atomCursor]);
                            transform.transform(p);
                            bioAssembly.setAtomInfo(atomNames[a], atomIds[atomCursor], altLocIds[atomCursor],
                                    p.x, p.y, p.z, occupancies[atomCursor], bFactors[atomCursor],
                                    elementNames[a], atomCharges[a]);
                        }

                        // Bond indices are stored as consecutive (first, second) pairs.
                        int[] bondIndices = structure.getGroupBondIndices(groupType);
                        for (int b = 0; b < bondOrders.length; b++) {
                            bioAssembly.setGroupBond(bondIndices[b * 2], bondIndices[b * 2 + 1], bondOrders[b]);
                        }
                    }
                    // TODO inter-group bonds are not copied; doing so needs a
                    // mapping from source atom indices to assembly atom indices.
                }

                // Advance the global cursors past this source chain.
                for (int g = groupIndex; g < groupIndex + numGroups; g++) {
                    atomIndex += structure.getNumAtomsInGroup(groupTypeIndices[g]);
                }
                groupIndex += numGroups;
            }
        }

        bioAssembly.finalizeStructure();
        resList.add(new Tuple2<String, StructureDataInterface>(structureId, bioAssembly));
    }
    return resList.iterator();
}

/** Returns true if {@code chainIndices} contains {@code chainIndex}. */
private static boolean containsChainIndex(int[] chainIndices, int chainIndex) {
    for (int candidate : chainIndices) {
        if (candidate == chainIndex) {
            return true;
        }
    }
    return false;
}
use of org.rcsb.mmtf.api.StructureDataInterface in project mmtf-spark by sbl-sdsc.
the class ProteinFoldDatasetCreator method main.
/**
 * Builds a binary (alpha vs. beta) protein fold classification dataset with
 * Word2Vec-encoded sequence features and writes it as a Parquet file.
 *
 * @param args args[0] path of the dataset output file
 * @throws IOException propagated from the called reader/encoder helpers
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();
    if (args.length != 1) {
        System.err.println("Usage: " + ProteinFoldDatasetCreator.class.getSimpleName() + " <dataset output file>");
        System.exit(1);
    }
    long start = System.nanoTime();
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(ProteinFoldDatasetCreator.class.getSimpleName());
    // try-with-resources stops the Spark context even if a step below fails
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // read MMTF Hadoop sequence file and create a non-redundant Pisces
        // subset set (<=20% seq. identity) of L-protein chains
        int sequenceIdentity = 20;
        double resolution = 3.0;
        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc)
                .flatMapToPair(new StructureToPolymerChains())
                .filter(new Pisces(sequenceIdentity, resolution));

        // get secondary structure content
        Dataset<Row> data = SecondaryStructureExtractor.getDataset(pdb);

        // classify chains by secondary structure type
        double minThreshold = 0.05;
        double maxThreshold = 0.15;
        data = addProteinFoldType(data, minThreshold, maxThreshold);

        // create a binary classification dataset
        data = data.filter("foldType = 'alpha' OR foldType = 'beta'").cache();
        // for a three-state classification model (alpha, beta, alpha+beta) use instead:
        // data = data.filter("foldType != 'other'").cache();

        // add Word2Vec encoded feature vector
        ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
        int n = 2;
        int windowSize = 11;
        int vectorSize = 50;
        data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize);
        data.printSchema();
        data.show(25);

        // keep only a subset of relevant fields for further processing
        data = data.select("structureChainId", "alpha", "beta", "coil", "foldType", "features");
        data.write().mode("overwrite").format("parquet").save(args[0]);

        long end = System.nanoTime();
        System.out.println((end - start) / 1E9 + " sec");
    }
}
use of org.rcsb.mmtf.api.StructureDataInterface in project mmtf-spark by sbl-sdsc.
the class SecondaryStructureElementsWord2VecEncoder method main.
/**
 * Extracts helical sequence segments from a non-redundant set of protein
 * chains, adds a Word2Vec-encoded feature vector and optionally saves the
 * resulting dataset.
 *
 * @param args either empty (results are only shown), or
 *             args[0] outputFilePath and args[1] fileFormat
 * @throws IOException propagated from the called reader/encoder helpers
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();
    if (args.length != 0 && args.length != 2) {
        System.err.println("Usage: " + SecondaryStructureElementsWord2VecEncoder.class.getSimpleName() + " [<outputFilePath> + <fileFormat>]");
        System.exit(1);
    }
    long start = System.nanoTime();
    // name the Spark application after this class (was a copy-pasted sibling class)
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(SecondaryStructureElementsWord2VecEncoder.class.getSimpleName());
    // try-with-resources stops the Spark context even if a step below fails
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // read MMTF Hadoop sequence file and create a non-redundant Pisces
        // subset set (<=20% seq. identity) of L-protein chains
        int sequenceIdentity = 20;
        double resolution = 3.0;
        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc)
                .flatMapToPair(new StructureToPolymerChains())
                .filter(new Pisces(sequenceIdentity, resolution));

        // extract helical sequence segments
        int segmentLength = 11;
        Dataset<Row> data = SecondaryStructureElementExtractor.getDataset(pdb, "H", segmentLength);
        System.out.println(data.count());
        data.show(10, false);

        // add Word2Vec encoded feature vector
        ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
        int n = 2;
        int windowSize = (segmentLength - 1) / 2;
        int vectorSize = 50;
        data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize);
        data.show(50, false);

        // optionally, save results
        if (args.length > 0) {
            if (args[1].equals("json")) {
                // coalesce data into a single file
                data = data.coalesce(1);
            }
            data.write().mode("overwrite").format(args[1]).save(args[0]);
        }

        long end = System.nanoTime();
        System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
    }
}
use of org.rcsb.mmtf.api.StructureDataInterface in project mmtf-spark by sbl-sdsc.
the class SecondaryStructureShiftedWord2VecEncoder method main.
/**
 * Extracts fixed-length sequence segments from a non-redundant set of protein
 * chains, encodes them with a shifted 3-gram Word2Vec model and writes the
 * resulting dataset to disk.
 *
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet)
 * @throws IOException propagated from the called reader/encoder helpers
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfReducedPath();
    if (args.length != 2) {
        System.err.println("Usage: " + SecondaryStructureShiftedWord2VecEncoder.class.getSimpleName() + " <outputFilePath> + <fileFormat>");
        System.exit(1);
    }
    long start = System.nanoTime();
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(SecondaryStructureShiftedWord2VecEncoder.class.getSimpleName());
    // try-with-resources stops the Spark context even if a step below fails
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // read MMTF Hadoop sequence file and create a non-redundant set (<=20% seq. identity)
        // of L-protein chains
        int sequenceIdentity = 20;
        double resolution = 3.0;
        JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc)
                .flatMapToPair(new StructureToPolymerChains())
                .filter(new Pisces(sequenceIdentity, resolution));

        // get content
        int segmentLength = 11;
        Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength);

        // create a Word2Vector representation of the protein sequences
        ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
        int windowSize = (segmentLength - 1) / 2;
        // dimension of feature vector (50)
        int vectorSize = 50;
        data = encoder.shifted3GramWord2VecEncode(windowSize, vectorSize).cache();
        data.printSchema();
        data.show(25, false);

        if (args[1].equals("json")) {
            // coalesce data into a single file
            data = data.coalesce(1);
        }
        data.write().mode("overwrite").format(args[1]).save(args[0]);

        long end = System.nanoTime();
        System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
    }
}
use of org.rcsb.mmtf.api.StructureDataInterface in project mmtf-spark by sbl-sdsc.
the class MmtfReader method readMmtfFiles.
/**
 * Reads uncompressed and compressed MMTF files recursively from
 * a given directory.
 * This method reads files with the mmtf or mmtf.gz extension. The key of
 * each pair is the file name up to (not including) the first ".mmtf".
 * Files that do not match, or that fail to be read or decoded, are
 * reported on standard error and left out of the result.
 *
 * @param path Path to MMTF files
 * @param sc Spark context
 * @return structure data as keyword/value pairs
 */
public static JavaPairRDD<String, StructureDataInterface> readMmtfFiles(String path, JavaSparkContext sc) {
    return sc.parallelize(getFiles(path)).mapToPair(new PairFunction<File, String, StructureDataInterface>() {
        private static final long serialVersionUID = 9018971417443154996L;

        @Override
        public Tuple2<String, StructureDataInterface> call(File f) throws Exception {
            String filePath = f.toString();
            boolean gzipped = filePath.contains(".mmtf.gz");
            if (!gzipped && !filePath.contains(".mmtf")) {
                // not an MMTF file; dropped by the filter below
                return null;
            }
            // try-with-resources closes the file even if the gzip header or
            // the MessagePack payload turns out to be invalid
            try (InputStream raw = new FileInputStream(f);
                    InputStream in = gzipped ? new GZIPInputStream(raw) : raw) {
                MmtfStructure mmtf = new MessagePackSerialization().deserialize(in);
                String fileName = f.getName();
                String key = fileName.substring(0, fileName.indexOf(".mmtf"));
                return new Tuple2<String, StructureDataInterface>(key, new GenericDecoder(mmtf));
            } catch (Exception e) {
                // best effort: skip unreadable files instead of failing the whole job
                System.err.println("Skipping " + f + ": " + e);
                return null;
            }
        }
    }).filter(t -> t != null);
}
Aggregations