Use of ca.pfv.spmf.algorithms.clustering.distanceFunctions.DistanceFunction in project legato by DOREMUS-ANR.
Class Matchifier, method match.
/**
 * Links the instances of the "source" and "target" datasets and saves the
 * resulting alignment.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Generate the document vectors and split them into "source" and
 *       "target" maps keyed by document name.</li>
 *   <li>Cluster each side with {@code Clustering.getClusters}.</li>
 *   <li>For every source/target cluster pair whose centroids have a cosine
 *       similarity above 0.4 and which both hold more than one instance:
 *       build one RDF model per cluster, ask {@code KeysClassifier} for the
 *       best key and, if one exists, run SILK and collect the produced links
 *       ("Precision++" list).</li>
 *   <li>For every source document, keep the most similar target document if
 *       its similarity reaches {@code legato.getThreshold()} ("Recall++"
 *       list).</li>
 *   <li>Merge both lists ("link repairing"), remove the working files and
 *       directories, and save the final mappings.</li>
 * </ol>
 *
 * @throws Exception propagated from any of the components called above
 */
public void match() throws Exception {
    LEGATO legato = LEGATO.getInstance();
    // Final links
    MapList mapList = new MapList();
    // Recall++ : links from the document-level comparison
    MapList mapList1 = new MapList();
    // Precision++ : links from the cluster-level key-based linking
    MapList mapList2 = new MapList();

    // ---- Getting vectors from "Source" and "Target" datasets ----
    VectorGenerator vectorGenerator = new VectorGenerator();
    vectorGenerator.GetAllTerms();
    // List of "Source" and "Target" TF-IDF vectors
    DocVector[] docVectors = vectorGenerator.GetDocumentVectors();
    // "Source" TF-IDF vectors keyed by their "docName"
    HashMap<String, double[]> srcMap = new HashMap<String, double[]>();
    // "Target" TF-IDF vectors keyed by their "docName"
    HashMap<String, double[]> tgtMap = new HashMap<String, double[]>();
    // Identify "Source" and "Target" vectors
    for (DocVector doc : docVectors) {
        double[] vector = doc.getVector();
        if (doc.parentFolder.equals("source")) {
            srcMap.put(doc.docName, vector);
        } else if (doc.parentFolder.equals("target")) {
            tgtMap.put(doc.docName, vector);
        }
    }

    // ---- "Hierarchical Clustering" on "Source" and "Target" datasets ----
    // For each pair --> apply RANKey and link instances based on the best key.
    ClusterList srcClS = Clustering.getClusters(srcMap);
    ClusterList tgtClS = Clustering.getClusters(tgtMap);

    // ---- RANKey ----
    int pairNumber = 0;
    // All pairs of clusters will be contained in folder "clusters"
    File dir = new File(legato.getPath() + File.separator + "clusters");
    dir.mkdirs();
    // The dataset models do not depend on the cluster pair, so each one is
    // loaded lazily on first use and then shared by all pairs.
    // NOTE(review): assumes CBDBuilder.getCBD only reads the dataset model.
    Model srcModel = null;
    Model tgtModel = null;
    for (Cluster clust1 : srcClS) {
        for (Cluster clust2 : tgtClS) {
            // Only sufficiently similar clusters that both contain more than
            // one instance are linked. Pairs of singleton clusters are
            // intentionally not handled here.
            if (CosineSimilarity.cosineSimilarity(clust1.getCentroid(), clust2.getCentroid()) > 0.4
                    && clust1.size() > 1 && clust2.size() > 1) {
                pairNumber = pairNumber + 1;
                // Both clusters will be contained in folder "pairNumber"
                File dirClusters = new File(dir.getAbsolutePath() + File.separator + pairNumber);
                dirClusters.mkdirs();

                // Retrieve the RDF model of the "Source" cluster
                if (srcModel == null) {
                    srcModel = ModelManager.loadModel(legato.src.toString());
                }
                Model model1 = ModelFactory.createDefaultModel();
                for (String rsrce : clust1.getIDs().split("\n")) {
                    String uri = legato.getSrcURIs().get(rsrce);
                    Resource resource = ResourceFactory.createResource(uri);
                    model1.add(CBDBuilder.getCBD(srcModel, resource));
                }

                // Retrieve the RDF model of the "Target" cluster
                if (tgtModel == null) {
                    tgtModel = ModelManager.loadModel(legato.tgt.toString());
                }
                Model model2 = ModelFactory.createDefaultModel();
                for (String rsrce : clust2.getIDs().split("\n")) {
                    String uri = legato.getTgtURIs().get(rsrce);
                    Resource resource = ResourceFactory.createResource(uri);
                    model2.add(CBDBuilder.getCBD(tgtModel, resource));
                }

                // Execute RANKey
                HashSet<String> bestKey = KeysClassifier.getBestKey(model1, model2, dirClusters);
                if (bestKey != null) {
                    // Execute SILK
                    SilkConfig.config(bestKey, dirClusters, dirClusters.toString() + File.separator + "source.nt", dirClusters.toString() + File.separator + "target.nt");
                    SILK.link(dirClusters.toString());
                    File file = new File(dirClusters.toString() + File.separator + "links.rdf");
                    AlignmentParser aparser = new AlignmentParser(0);
                    Alignment links = aparser.parse(file.toURI());
                    for (Cell cell : links) {
                        mapList2.add(cell.getObject1AsURI().toString(), cell.getObject2AsURI().toString(), cell.getStrength());
                    }
                }
            }
        }
    }

    // ---- Comparison ----
    System.out.println("comparison");
    for (DocVector srcDoc : docVectors) {
        // Only "source" documents are compared against "target" documents.
        if (!srcDoc.parentFolder.equals("source")) {
            continue;
        }
        String tgtDoc = null;
        double simVal = 0;
        for (DocVector candidate : docVectors) {
            if (!candidate.parentFolder.equals("target")) {
                continue;
            }
            // Computed once per pair; the first target is always accepted,
            // later ones only when strictly more similar.
            double sim = CosineSimilarity.cosineSimilarity(srcDoc, candidate);
            if (tgtDoc == null || sim > simVal) {
                tgtDoc = candidate.docName;
                simVal = sim;
            }
        }
        // No additional filtering (e.g. on resource type) is applied here.
        if (tgtDoc != null && simVal >= legato.getThreshold()) {
            mapList1.add(legato.getSrcURIs().get(srcDoc.docName), legato.getTgtURIs().get(tgtDoc), simVal);
        }
    }

    // ---- Link repairing ----
    // For every Recall++ link: if Precision++ has links with the same source,
    // those replace it; otherwise the Recall++ link is kept.
    for (Map map1 : mapList1) {
        boolean exist = false;
        for (Map map2 : mapList2) {
            if (map1.getSourceURI().equals(map2.getSourceURI())) {
                if (map1.getTargetURI().equals(map2.getTargetURI())) {
                    System.out.println("OUI");
                } else {
                    System.out.println("NON");
                }
                exist = true;
                mapList.add(map2);
            }
        }
        if (!exist) {
            mapList.add(map1);
        }
    }
    // Add the Precision++ links whose source has no Recall++ link at all.
    for (Map map2 : mapList2) {
        boolean exist = false;
        for (Map map1 : mapList1) {
            if (map2.getSourceURI().equals(map1.getSourceURI())) {
                exist = true;
                break;
            }
        }
        if (!exist) {
            System.out.println("+1");
            mapList.add(map2);
        }
    }

    // ---- Clean up working files, then create and save the alignment file ----
    File dirr = new File(legato.getPath() + File.separator + "docs");
    delete(dirr);
    File dirind = new File(legato.getPath() + File.separator + "index");
    delete(dirind);
    File srcFile = new File(legato.getPath() + File.separator + "source.rdf");
    srcFile.deleteOnExit();
    File tgtFile = new File(legato.getPath() + File.separator + "target.rdf");
    tgtFile.deleteOnExit();
    File txtFile = new File(legato.getPath() + File.separator + "nom.txt");
    txtFile.deleteOnExit();
    Align.saveMappings(mapList);
}
Use of ca.pfv.spmf.algorithms.clustering.distanceFunctions.DistanceFunction in project legato by DOREMUS-ANR.
Class Clustering, method getClusters.
/**
 * Clusters the given document vectors with {@code HierarchicalClustering}
 * (distance: {@code DistanceCorrelation}, maximum distance 0.2) and converts
 * the result into a {@code ClusterList} of named documents.
 *
 * <p>The clustering result only carries raw vectors, so each vector is mapped
 * back to its document(s) by comparing array contents. Every document is
 * added to a cluster at most once, even when several documents share an
 * identical vector.
 *
 * @param docs document vectors keyed by document name
 * @return the clusters, with centroids and exemplars updated
 * @throws NumberFormatException declared for the clustering call
 * @throws IOException declared for the clustering call
 */
public static ClusterList getClusters(HashMap<String, double[]> docs) throws NumberFormatException, IOException {
    // Maximum distance given to the clustering algorithm.
    // (0.415 was noted as the best threshold on DS_SM.)
    double maxdistance = 0.2;
    DistanceFunction distanceFunction = new DistanceCorrelation();
    HierarchicalClustering algo = new HierarchicalClustering();
    // The result is read back from algo.clusters below, so the returned list is not kept.
    algo.runAlgorithm(docs, maxdistance, distanceFunction);

    ClusterList clusterList = new ClusterList();
    // Names of documents already placed in a cluster. Without this, documents
    // with identical vectors would be re-added once per occurrence of that vector.
    java.util.Set<String> assignedDocs = new java.util.HashSet<String>();
    // For each cluster
    for (ClusterWithMean clust : algo.clusters) {
        Cluster cluster = new Cluster();
        // For each vector
        for (DoubleArray vector : clust.getVectors()) {
            for (Entry<String, double[]> doc : docs.entrySet()) {
                // Set.add returns false when the name was already assigned.
                if (Arrays.equals(doc.getValue(), vector.data) && assignedDocs.add(doc.getKey())) {
                    cluster.add(new DocVec(doc.getKey(), doc.getValue()));
                }
            }
        }
        clusterList.add(cluster);
    }
    clusterList.updateCentroids();
    clusterList.updateExemplars();
    return clusterList;
}
Aggregations