Use of ca.pfv.spmf.patterns.cluster.DoubleArray in project legato by DOREMUS-ANR:
the class Matchifier, method match.
/**
 * Runs the full LEGATO matching pipeline:
 * <ol>
 *   <li>builds TF-IDF vectors for the "source" and "target" datasets;</li>
 *   <li>clusters each side hierarchically, then links comparable cluster
 *       pairs via RANKey + SILK (precision links, {@code mapList2});</li>
 *   <li>compares all vectors directly by cosine similarity
 *       (recall links, {@code mapList1});</li>
 *   <li>reconciles both link sets into the final {@code mapList};</li>
 *   <li>deletes working files and saves the final alignment.</li>
 * </ol>
 *
 * @throws Exception if vector generation, clustering, SILK linking or
 *                   alignment parsing fails
 */
public void match() throws Exception {
    LEGATO legato = LEGATO.getInstance();
    // Final links
    MapList mapList = new MapList();
    // Recall++ links (direct vector comparison)
    MapList mapList1 = new MapList();
    // Precision++ links (cluster pairing + RANKey + SILK)
    MapList mapList2 = new MapList();
    /*
     * Getting TF-IDF vectors from the "Source" and "Target" datasets.
     */
    VectorGenerator vectorGenerator = new VectorGenerator();
    vectorGenerator.GetAllTerms();
    // All "Source" and "Target" TF-IDF vectors
    DocVector[] docVectors = vectorGenerator.GetDocumentVectors();
    // "Source" TF-IDF vectors keyed by docName
    HashMap<String, double[]> srcMap = new HashMap<String, double[]>();
    // "Target" TF-IDF vectors keyed by docName
    HashMap<String, double[]> tgtMap = new HashMap<String, double[]>();
    // Split the vectors by their parent folder ("source" vs "target")
    for (DocVector doc : docVectors) {
        double[] vector = doc.getVector();
        if (doc.parentFolder.equals("source"))
            srcMap.put(doc.docName, vector);
        else if (doc.parentFolder.equals("target"))
            tgtMap.put(doc.docName, vector);
    }
    /*
     * Hierarchical clustering on the "Source" and "Target" datasets.
     * For each pair of comparable clusters, apply RANKey and link
     * instances based on the best key.
     */
    // Clusters of the "Source" dataset
    ClusterList srcClS = Clustering.getClusters(srcMap);
    // Clusters of the "Target" dataset
    ClusterList tgtClS = Clustering.getClusters(tgtMap);
    /*
     * RANKey
     */
    int pairNumber = 0;
    // Every retained pair of clusters gets its own subfolder of "clusters"
    File dir = new File(legato.getPath() + File.separator + "clusters");
    dir.mkdirs();
    for (Cluster clust1 : srcClS) {
        for (Cluster clust2 : tgtClS) {
            // Only consider cluster pairs whose centroids are similar enough
            if (CosineSimilarity.cosineSimilarity(clust1.getCentroid(), clust2.getCentroid()) > 0.4) {
                // Both clusters must contain more than one instance
                if (clust1.size() > 1 && clust2.size() > 1) {
                    pairNumber++;
                    // Working folder for this pair of clusters
                    File dirClusters = new File(dir.getAbsolutePath() + File.separator + pairNumber);
                    dirClusters.mkdirs();
                    /*
                     * Retrieve the RDF model (CBDs) of each cluster.
                     */
                    Model srcModel = ModelManager.loadModel(legato.src.toString());
                    Model model1 = ModelFactory.createDefaultModel();
                    String[] resources = clust1.getIDs().split("\n");
                    for (String rsrce : resources) {
                        String uri = legato.getSrcURIs().get(rsrce);
                        Resource resource = ResourceFactory.createResource(uri);
                        model1.add(CBDBuilder.getCBD(srcModel, resource));
                    }
                    Model tgtModel = ModelManager.loadModel(legato.tgt.toString());
                    Model model2 = ModelFactory.createDefaultModel();
                    resources = clust2.getIDs().split("\n");
                    for (String rsrce : resources) {
                        String uri = legato.getTgtURIs().get(rsrce);
                        Resource resource = ResourceFactory.createResource(uri);
                        model2.add(CBDBuilder.getCBD(tgtModel, resource));
                    }
                    /*
                     * Execute RANKey to find the best discriminating key.
                     */
                    HashSet<String> bestKey = KeysClassifier.getBestKey(model1, model2, dirClusters);
                    if (bestKey != null) {
                        /*
                         * Execute SILK with the best key and collect the
                         * resulting links into mapList2.
                         */
                        SilkConfig.config(bestKey, dirClusters, dirClusters.toString() + File.separator + "source.nt", dirClusters.toString() + File.separator + "target.nt");
                        SILK.link(dirClusters.toString());
                        File file = new File(dirClusters.toString() + File.separator + "links.rdf");
                        AlignmentParser aparser = new AlignmentParser(0);
                        Alignment links = aparser.parse(file.toURI());
                        for (Cell cell : links) {
                            mapList2.add(cell.getObject1AsURI().toString(), cell.getObject2AsURI().toString(), cell.getStrength());
                        }
                    }
                }
                // NOTE(review): singleton-to-singleton cluster pairs were
                // deliberately left unlinked here (previously dead code).
            }
        }
    }
    /*
     * Comparison: for every source vector, find the single most similar
     * target vector and keep the pair when it clears the threshold.
     */
    System.out.println("comparison");
    for (int i = 0; i < docVectors.length; i++) {
        DocVector srcDoc = docVectors[i];
        String tgtDoc = null;
        double simVal = 0;
        for (int j = 0; j < docVectors.length; j++) {
            if ((srcDoc.parentFolder.equals("source")) && (docVectors[j].parentFolder.equals("target"))) {
                if ((tgtDoc == null) || (CosineSimilarity.cosineSimilarity(srcDoc, docVectors[j]) > simVal)) {
                    tgtDoc = docVectors[j].docName;
                    simVal = CosineSimilarity.cosineSimilarity(srcDoc, docVectors[j]);
                }
            }
        }
        if ((tgtDoc != null) && simVal >= legato.getThreshold()) {
            // NOTE(review): a type-compatibility check between the two
            // resources used to exist here (previously commented out).
            mapList1.add(legato.getSrcURIs().get(srcDoc.docName), legato.getTgtURIs().get(tgtDoc), simVal);
        }
    }
    /*
     * Link repairing: prefer the precision links (mapList2) whenever a
     * source URI appears in both sets; keep recall-only and
     * precision-only links as well.
     */
    for (Map map1 : mapList1) {
        boolean exist = false;
        for (Map map2 : mapList2) {
            if (map1.getSourceURI().equals(map2.getSourceURI())) {
                // Trace whether the two link sets agree on the target
                if (map1.getTargetURI().equals(map2.getTargetURI()))
                    System.out.println("OUI");
                else
                    System.out.println("NON");
                exist = true;
                mapList.add(map2);
            }
        }
        if (!exist)
            mapList.add(map1);
    }
    // Add precision links whose source URI was never seen by recall
    for (Map map2 : mapList2) {
        boolean exist = false;
        for (Map map1 : mapList1) {
            if (map2.getSourceURI().equals(map1.getSourceURI())) {
                exist = true;
            }
        }
        if (!exist) {
            System.out.println("+1");
            mapList.add(map2);
        }
    }
    /*
     * Delete working files/folders, then save the alignment file.
     */
    File dirr = new File(legato.getPath() + File.separator + "docs");
    delete(dirr);
    File dirind = new File(legato.getPath() + File.separator + "index");
    delete(dirind);
    File srcFile = new File(legato.getPath() + File.separator + "source.rdf");
    srcFile.deleteOnExit();
    File tgtFile = new File(legato.getPath() + File.separator + "target.rdf");
    tgtFile.deleteOnExit();
    File txtFile = new File(legato.getPath() + File.separator + "nom.txt");
    txtFile.deleteOnExit();
    Align.saveMappings(mapList);
}
Use of ca.pfv.spmf.patterns.cluster.DoubleArray in project legato by DOREMUS-ANR:
the class PropertyHandler, method clean.
/**
 * Deletes "problematic" (heterogeneous) properties from the source and
 * target datasets before matching: every distinct property of the merged
 * (rewritten) models is scored, the scores are clustered with DBSCAN,
 * and the properties falling into the cluster with the highest mean
 * score are removed from both models. The filtered models are then
 * registered on the {@code LEGATO} singleton.
 *
 * @param srcPath path of the source RDF dataset
 * @param tgtPath path of the target RDF dataset
 * @throws IOException if reading/writing a working file fails
 */
public static void clean(String srcPath, String tgtPath) throws IOException {
    LEGATO legato = LEGATO.getInstance();
    Model srcModel = ModelManager.loadModel(srcPath);
    Model tgtModel = ModelManager.loadModel(tgtPath);
    // Rewritten views of both models, merged for property scoring
    // (the original dead ModelFactory.createDefaultModel() inits removed)
    Model s = ModelManager.rewrite(srcModel, false);
    Model t = ModelManager.rewrite(tgtModel, false);
    Model mergedModel = ModelFactory.createDefaultModel();
    mergedModel.add(s);
    mergedModel.add(t);
    // Score every distinct property of the merged model
    List<Resource> properties = getDistinctProperties(mergedModel);
    System.out.println(legato.getPropList());
    HashMap<String, String> propScoreList = new HashMap<String, String>();
    properties.forEach((property) -> {
        propScoreList.put(property.toString(), String.valueOf(getScore(property, mergedModel)));
    });
    // Sort the properties by score
    ValueComparator<String> comp = new ValueComparator<String>(propScoreList);
    TreeMap<String, String> mapTriee = new TreeMap<String, String>(comp);
    mapTriee.putAll(propScoreList);
    System.out.println(mapTriee);
    // Dump the sorted scores, one per line, as DBSCAN input
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < mapTriee.entrySet().size(); i++) {
        sb.append(Double.valueOf((String) mapTriee.values().toArray()[i]) + "\n");
    }
    FileManager.create("nom", sb.toString().trim());
    // Cluster the scores with DBSCAN
    int minPts = 1;
    double epsilon = 5d;
    AlgoDBSCAN algo = new AlgoDBSCAN();
    List<Cluster> clusters = algo.runAlgorithm(legato.getPath() + File.separator + "nom.txt", minPts, epsilon, "\n");
    algo.printStatistics();
    // Keep the score cluster with the highest mean: its properties are
    // considered heterogeneous/problematic
    double highMean = 0;
    double[] heterCluster = null;
    for (Cluster cluster : clusters) {
        double[] arr = new double[cluster.getVectors().size()];
        int i = 0;
        for (DoubleArray dataPoint : cluster.getVectors()) {
            arr[i++] = dataPoint.data[0];
        }
        A a = new A(arr);
        if (highMean < a.getMean()) {
            highMean = a.getMean();
            heterCluster = arr;
        }
    }
    // Collect the names of the properties whose score falls in that cluster
    List<String> propList = new ArrayList<String>();
    if (heterCluster != null) {
        // guard against an empty DBSCAN result (would previously NPE)
        Iterator<Entry<String, String>> it = mapTriee.entrySet().iterator();
        while (it.hasNext()) {
            Entry<String, String> entry = it.next();
            for (int i = 0; i < heterCluster.length; i++) {
                if (String.valueOf(heterCluster[i]).equals(entry.getValue()))
                    propList.add(entry.getKey());
            }
        }
    }
    System.out.println(propList);
    // Rebuild both models without the problematic properties
    srcModel = ModelManager.rewrite(srcModel, true);
    System.out.println("source");
    tgtModel = ModelManager.rewrite(tgtModel, true);
    Model srcFinalModel = ModelFactory.createDefaultModel();
    srcModel.listStatements().toSet().forEach((stmt) -> {
        Property property = stmt.getPredicate();
        if (!(propList.contains(property.toString()))) {
            srcFinalModel.add(stmt);
        }
    });
    Model tgtFinalModel = ModelFactory.createDefaultModel();
    tgtModel.listStatements().toSet().forEach((stmt) -> {
        Property property = stmt.getPredicate();
        if (!propList.contains(property.toString())) {
            tgtFinalModel.add(stmt);
        }
    });
    // Register the filtered models on the LEGATO singleton
    legato.setSource(FileManager.getCreatedRDFile("source", srcFinalModel));
    legato.setTarget(FileManager.getCreatedRDFile("target", tgtFinalModel));
    System.out.println("finish");
}
Use of ca.pfv.spmf.patterns.cluster.DoubleArray in project legato by DOREMUS-ANR:
the class Clustering, method getClusters.
/**
 * Clusters the given TF-IDF document vectors hierarchically and maps
 * each clustered vector back to the document it came from.
 *
 * @param docs document vectors keyed by document name
 * @return the resulting clusters, with centroids and exemplars updated
 * @throws NumberFormatException propagated from the clustering algorithm
 * @throws IOException           propagated from the clustering algorithm
 */
public static ClusterList getClusters(HashMap<String, double[]> docs) throws NumberFormatException, IOException {
    // Maximum merge distance (0.415 was the best threshold on DS_SM)
    double mergeThreshold = 0.2;
    HierarchicalClustering clusterer = new HierarchicalClustering();
    List<ClusterWithMean> rawClusters = clusterer.runAlgorithm(docs, mergeThreshold, new DistanceCorrelation());
    ClusterList clusterList = new ClusterList();
    // Rebuild named clusters: for every vector of every raw cluster,
    // find the document whose vector matches it exactly.
    for (ClusterWithMean rawCluster : rawClusters) {
        Cluster cluster = new Cluster();
        for (DoubleArray vector : rawCluster.getVectors()) {
            for (Entry<String, double[]> doc : docs.entrySet()) {
                if (Arrays.equals(doc.getValue(), vector.data)) {
                    cluster.add(new DocVec(doc.getKey(), doc.getValue()));
                }
            }
        }
        clusterList.add(cluster);
    }
    clusterList.updateCentroids();
    clusterList.updateExemplars();
    return clusterList;
}
Use of ca.pfv.spmf.patterns.cluster.DoubleArray in project legato by DOREMUS-ANR:
the class HierarchicalClustering, method mergeTheClosestCluster.
/**
 * Merge the two closest clusters in terms of distance.
 * Only pairs whose distance does not exceed {@code maxDistance} are
 * eligible for merging.
 * @return true if a merge was done, otherwise false.
 */
private boolean mergeTheClosestCluster() {
    // These variables will contain the two closest clusters that
    // can be merged
    ClusterWithMean clusterToMerge1 = null;
    ClusterWithMean clusterToMerge2 = null;
    // Sentinel must be a true double maximum: the original
    // Integer.MAX_VALUE would wrongly reject distances beyond int range
    double minClusterDistance = Double.POSITIVE_INFINITY;
    // Compare all pairs of clusters i and j
    for (int i = 0; i < clusters.size(); i++) {
        for (int j = i + 1; j < clusters.size(); j++) {
            // calculate the distance between i and j
            double distance = distanceFunction.calculateDistance(clusters.get(i).getmean(), clusters.get(j).getmean());
            // keep the smallest distance seen so far that is mergeable
            if (distance < minClusterDistance && distance <= maxDistance) {
                minClusterDistance = distance;
                clusterToMerge1 = clusters.get(i);
                clusterToMerge2 = clusters.get(j);
            }
        }
    }
    // if no close clusters were found, return false
    if (clusterToMerge1 == null) {
        return false;
    }
    // else, merge the two closest clusters: move every vector of the
    // second cluster into the first one
    for (DoubleArray vector : clusterToMerge2.getVectors()) {
        clusterToMerge1.addVector(vector);
    }
    // after merging, we need to recompute the mean of the resulting cluster
    clusterToMerge1.recomputeClusterMean();
    // we delete the cluster that was merged
    clusters.remove(clusterToMerge2);
    // increase iteration count for statistics
    iterationCount++;
    return true;
}
Use of ca.pfv.spmf.patterns.cluster.DoubleArray in project legato by DOREMUS-ANR:
the class HierarchicalClustering, method runAlgorithm.
/**
 * Runs agglomerative hierarchical clustering over the given document
 * vectors: every vector starts in its own cluster, then the two closest
 * clusters are merged repeatedly until no pair is within
 * {@code maxDistance} of each other.
 *
 * @param docs             document vectors keyed by document name
 * @param maxDistance      maximum distance allowed for a merge
 * @param distanceFunction distance measure between cluster means
 * @return the final list of clusters
 * @throws NumberFormatException declared for interface compatibility
 * @throws IOException           declared for interface compatibility
 */
public List<ClusterWithMean> runAlgorithm(HashMap<String, double[]> docs, double maxDistance, DistanceFunction distanceFunction) throws NumberFormatException, IOException {
    startTimestamp = System.currentTimeMillis();
    this.maxDistance = maxDistance;
    this.distanceFunction = distanceFunction;
    // (1) Start with one singleton cluster per input vector.
    clusters = new ArrayList<ClusterWithMean>();
    for (Entry<String, double[]> doc : docs.entrySet()) {
        double[] values = doc.getValue();
        DoubleArray point = new DoubleArray(values);
        ClusterWithMean singleton = new ClusterWithMean(values.length);
        singleton.addVector(point);
        singleton.setMean(point.clone());
        clusters.add(singleton);
    }
    // (2) Greedily merge the two closest clusters until no pair is
    // close enough to combine.
    boolean merged;
    do {
        merged = mergeTheClosestCluster();
        // record memory usage
        MemoryLogger.getInstance().checkMemory();
    } while (merged);
    // record end time and return the final clusters
    endTimestamp = System.currentTimeMillis();
    return clusters;
}
Aggregations