use of legato.LEGATO in project legato by DOREMUS-ANR.
the class Matchifier method match.
/**
 * Runs the complete matching pipeline between the "source" and "target"
 * datasets and saves the resulting alignment.
 *
 * <ol>
 * <li>Builds the TF-IDF document vectors and splits them by parent folder
 * ("source" / "target").</li>
 * <li>Clusters each side. For every pair of clusters whose centroids have a
 * cosine similarity above 0.4 and that both hold more than one instance, the
 * best key is computed (RANKey) and SILK is run with it; the produced links go
 * to the "precision" list.</li>
 * <li>For every source document, the most similar target document is kept
 * when its similarity reaches {@code legato.getThreshold()}; these links go
 * to the "recall" list.</li>
 * <li>Link repairing: for a given source URI the key-based links take
 * precedence over the similarity-based link.</li>
 * <li>Temporary folders/files are removed and the final mappings are
 * saved.</li>
 * </ol>
 *
 * @throws Exception if any step of the pipeline fails
 */
public void match() throws Exception {
    LEGATO legato = LEGATO.getInstance();
    // Final links
    MapList mapList = new MapList();
    // Recall++ : links obtained from the document-level cosine comparison
    MapList mapList1 = new MapList();
    // Precision++ : links obtained from RANKey + SILK on cluster pairs
    MapList mapList2 = new MapList();

    /* ---- Getting vectors from "Source" and "Target" datasets ---- */
    VectorGenerator vectorGenerator = new VectorGenerator();
    vectorGenerator.GetAllTerms();
    // List of "Source" and "Target" TF-IDF vectors
    DocVector[] docVectors = vectorGenerator.GetDocumentVectors();
    // "Source" TF-IDF vectors indexed by their "docName"
    HashMap<String, double[]> srcMap = new HashMap<String, double[]>();
    // "Target" TF-IDF vectors indexed by their "docName"
    HashMap<String, double[]> tgtMap = new HashMap<String, double[]>();
    for (DocVector doc : docVectors) {
        double[] vector = doc.getVector();
        if (doc.parentFolder.equals("source")) {
            srcMap.put(doc.docName, vector);
        } else if (doc.parentFolder.equals("target")) {
            tgtMap.put(doc.docName, vector);
        }
    }

    /* ---- Hierarchical clustering, then RANKey + SILK on each cluster pair ---- */
    ClusterList srcClS = Clustering.getClusters(srcMap);
    ClusterList tgtClS = Clustering.getClusters(tgtMap);

    int pairNumber = 0;
    // All pairs of clusters will be contained in folder "clusters"
    File dir = new File(legato.getPath() + File.separator + "clusters");
    dir.mkdirs();

    // The full datasets are loaded lazily, once, on the first qualifying pair
    // instead of being re-read from disk for every pair of clusters.
    Model srcModel = null;
    Model tgtModel = null;

    for (Cluster clust1 : srcClS) {
        for (Cluster clust2 : tgtClS) {
            // Only sufficiently similar clusters are compared
            if (!(CosineSimilarity.cosineSimilarity(clust1.getCentroid(), clust2.getCentroid()) > 0.4)) {
                continue;
            }
            // Key discovery needs more than one instance on both sides;
            // other pairs (including singleton/singleton) produce no link here.
            if (!(clust1.size() > 1 && clust2.size() > 1)) {
                continue;
            }
            pairNumber = pairNumber + 1;
            // Both clusters will be contained in folder "pairNumber"
            File dirClusters = new File(dir.getAbsolutePath() + File.separator + pairNumber);
            dirClusters.mkdirs();

            /* Retrieve the RDF model (CBD of each instance) of the 2 clusters */
            if (srcModel == null) {
                srcModel = ModelManager.loadModel(legato.src.toString());
                tgtModel = ModelManager.loadModel(legato.tgt.toString());
            }
            Model model1 = ModelFactory.createDefaultModel();
            for (String rsrce : clust1.getIDs().split("\n")) {
                String uri = legato.getSrcURIs().get(rsrce);
                Resource resource = ResourceFactory.createResource(uri);
                model1.add(CBDBuilder.getCBD(srcModel, resource));
            }
            Model model2 = ModelFactory.createDefaultModel();
            for (String rsrce : clust2.getIDs().split("\n")) {
                String uri = legato.getTgtURIs().get(rsrce);
                Resource resource = ResourceFactory.createResource(uri);
                model2.add(CBDBuilder.getCBD(tgtModel, resource));
            }

            /* Execute RANKey */
            HashSet<String> bestKey = KeysClassifier.getBestKey(model1, model2, dirClusters);
            if (bestKey == null) {
                continue;
            }

            /* Execute SILK with the best key and collect the produced links */
            SilkConfig.config(bestKey, dirClusters, dirClusters.toString() + File.separator + "source.nt", dirClusters.toString() + File.separator + "target.nt");
            SILK.link(dirClusters.toString());
            File file = new File(dirClusters.toString() + File.separator + "links.rdf");
            AlignmentParser aparser = new AlignmentParser(0);
            Alignment links = aparser.parse(file.toURI());
            for (Cell cell : links) {
                mapList2.add(cell.getObject1AsURI().toString(), cell.getObject2AsURI().toString(), cell.getStrength());
            }
        }
    }

    /* ---- Comparison: best target document for each source document ---- */
    System.out.println("comparison");
    for (int i = 0; i < docVectors.length; i++) {
        DocVector srcDoc = docVectors[i];
        // Only "source" documents can be the left-hand side of a link
        if (!srcDoc.parentFolder.equals("source")) {
            continue;
        }
        String tgtDoc = null;
        double simVal = 0;
        for (int j = 0; j < docVectors.length; j++) {
            if (!docVectors[j].parentFolder.equals("target")) {
                continue;
            }
            double sim = CosineSimilarity.cosineSimilarity(srcDoc, docVectors[j]);
            // The first target is always taken; later ones must be strictly better
            if ((tgtDoc == null) || (sim > simVal)) {
                tgtDoc = docVectors[j].docName;
                simVal = sim;
            }
        }
        // NOTE: an additional check that both resources share the same type
        // used to exist here and is currently disabled.
        if ((tgtDoc != null) && simVal >= legato.getThreshold()) {
            mapList1.add(legato.getSrcURIs().get(srcDoc.docName), legato.getTgtURIs().get(tgtDoc), simVal);
        }
    }

    /* ---- Link repairing ---- */
    // For each similarity-based link: if key-based links exist for the same
    // source URI they replace it, otherwise the similarity-based link is kept.
    for (Map map1 : mapList1) {
        boolean exist = false;
        for (Map map2 : mapList2) {
            if (map1.getSourceURI().equals(map2.getSourceURI())) {
                if (map1.getTargetURI().equals(map2.getTargetURI())) {
                    System.out.println("OUI");
                } else {
                    System.out.println("NON");
                }
                exist = true;
                mapList.add(map2);
            }
        }
        if (!exist) {
            mapList.add(map1);
        }
    }
    // Key-based links whose source URI has no similarity-based link are added too
    for (Map map2 : mapList2) {
        boolean exist = false;
        for (Map map1 : mapList1) {
            if (map2.getSourceURI().equals(map1.getSourceURI())) {
                exist = true;
            }
        }
        if (!exist) {
            System.out.println("+1");
            mapList.add(map2);
        }
    }

    /* ---- Remove temporary data, then create and save the alignment file ---- */
    File dirr = new File(legato.getPath() + File.separator + "docs");
    delete(dirr);
    File dirind = new File(legato.getPath() + File.separator + "index");
    delete(dirind);
    File srcFile = new File(legato.getPath() + File.separator + "source.rdf");
    srcFile.deleteOnExit();
    File tgtFile = new File(legato.getPath() + File.separator + "target.rdf");
    tgtFile.deleteOnExit();
    File txtFile = new File(legato.getPath() + File.separator + "nom.txt");
    txtFile.deleteOnExit();
    Align.saveMappings(mapList);
}
use of legato.LEGATO in project legato by DOREMUS-ANR.
the class ModelManager method rewrite.
/**
 * Places all literals found in the CBD of each typed resource at a
 * distance of 1 from that resource.
 *
 * For every subject of {@code model} accepted by {@code legato.hasType}, the
 * CBD is computed (optionally extended with its direct predecessors and
 * successors). Each literal of that CBD is attached directly to the resource
 * in the returned model, under a property that stands for the chain of
 * properties leading to the literal. New chains are registered in the LEGATO
 * property list under the name {@code http://model.org/property<index>}.
 * {@code rdf:type} statements whose subject is itself typed are copied onto
 * the resource as well.
 *
 * @param model the model to rewrite (it is only read)
 * @param ok    when {@code true}, the CBD of each resource is extended with
 *              the CBD of its direct predecessors and successors
 * @return a new model holding the flattened description of each typed resource
 * @throws IOException declared for callers; I/O failures raised while
 *                     processing a single resource are reported on stderr and
 *                     do not abort the rewrite
 */
public static Model rewrite(Model model, boolean ok) throws IOException {
    LEGATO legato = LEGATO.getInstance();
    Model finalModel = ModelFactory.createDefaultModel();
    model.listSubjects().toSet().forEach((resource) -> {
        // Only resources that belong to a given "type" are rewritten
        if (!legato.hasType(resource)) {
            return;
        }
        Model m = CBDBuilder.getCBD(model, resource);
        if (ok) {
            m.add(CBDBuilder.getCBDDirectPredecessors(model, resource));
            m.add(CBDBuilder.getCBDDirectSuccessors(model, resource));
        }
        try {
            m.add(ModelManager.parseCBD(m));
        } catch (IOException e1) {
            e1.printStackTrace();
        }
        m.listStatements().toSet().forEach((stmt) -> {
            Resource sub = stmt.getSubject();
            Property prop = stmt.getPredicate();
            RDFNode object = stmt.getObject();
            if (object.isLiteral()) {
                // Filter.any places no restriction on the statements the path may follow
                Path path = OntTools.findShortestPath(m, resource, object, Filter.any);
                if (path != null) {
                    // Get the successive properties from the path
                    List<Property> properties = getPropFromPath(path);
                    attachLiteral(legato, finalModel, resource, object, properties);
                } else {
                    // No path from the resource itself: try from each of its predecessors
                    String sparqlQueryString = "select ?predec where {" + "?predec ?prop <" + resource + ">." + "}";
                    Query query = QueryFactory.create(sparqlQueryString);
                    QueryExecution qexec = QueryExecutionFactory.create(query, model);
                    try {
                        ResultSet queryResults = qexec.execSelect();
                        while (queryResults.hasNext()) {
                            QuerySolution qs = queryResults.nextSolution();
                            final PathManager.Path path2 = PathManager.findShortestPath(model, qs.getResource("?predec"), object, prop);
                            if (path2 != null) {
                                // Get the successive properties from the path
                                List<Property> properties = getPropFromPath(path2);
                                attachLiteral(legato, finalModel, resource, object, properties);
                            }
                        }
                    } finally {
                        // Release the query even if iterating the results fails
                        qexec.close();
                    }
                }
            } else if (prop.equals(RDF.type) && (legato.hasType(sub))) {
                finalModel.createResource(resource.toString()).addProperty(RDF.type, object);
            }
        });
    });
    return finalModel;
}

/**
 * Attaches {@code object} directly to {@code resource} in {@code finalModel},
 * using the property registered for the given chain of properties. A chain
 * that is not yet known is registered under
 * {@code http://model.org/property<current size of the property list>}.
 */
private static void attachLiteral(LEGATO legato, Model finalModel, Resource resource, RDFNode object, List<Property> properties) {
    if (!legato.getPropList().existProperty(properties)) {
        String propertyName = "http://model.org/property" + legato.getPropList().size();
        finalModel.createResource(resource.toString()).addProperty(finalModel.createProperty(propertyName), object);
        try {
            legato.addToPropList(propertyName, properties);
        } catch (IOException e) {
            // Previously swallowed silently; reported like the other I/O failures of rewrite
            e.printStackTrace();
        }
    } else {
        String propertyName = legato.getPropList().getPropertyName(properties);
        finalModel.createResource(resource.toString()).addProperty(finalModel.createProperty(propertyName), object);
    }
}
use of legato.LEGATO in project legato by DOREMUS-ANR.
the class PropertyHandler method clean.
/**
 * Deletes problematic properties from the source and target datasets.
 *
 * Both datasets are flattened with {@code ModelManager.rewrite(.., false)}
 * and merged; every distinct property of the merged model receives a score
 * ({@code getScore}). The scores are written to the "nom" file and clustered
 * with DBSCAN (minPts = 1, epsilon = 5); the cluster with the highest mean is
 * selected, and the properties whose score belongs to it are considered
 * problematic. The datasets are then flattened again with
 * {@code ModelManager.rewrite(.., true)}, the statements using a problematic
 * property are dropped, and the results become the new LEGATO source and
 * target.
 *
 * If no cluster has a mean greater than 0, no property is removed.
 *
 * @param srcPath path of the source dataset
 * @param tgtPath path of the target dataset
 * @throws IOException if a dataset or an intermediate file cannot be read or written
 */
public static void clean(String srcPath, String tgtPath) throws IOException {
    LEGATO legato = LEGATO.getInstance();
    Model srcModel = ModelManager.loadModel(srcPath);
    Model tgtModel = ModelManager.loadModel(tgtPath);

    // Score every property on the merged, flattened datasets
    Model s = ModelManager.rewrite(srcModel, false);
    Model t = ModelManager.rewrite(tgtModel, false);
    Model mergedModel = ModelFactory.createDefaultModel();
    mergedModel.add(s);
    mergedModel.add(t);
    List<Resource> properties = getDistinctProperties(mergedModel);
    System.out.println(legato.getPropList());
    HashMap<String, String> propScoreList = new HashMap<String, String>();
    properties.forEach((property) -> {
        propScoreList.put(property.toString(), String.valueOf(getScore(property, mergedModel)));
    });

    // Properties ordered by the comparator built on their scores
    ValueComparator<String> comp = new ValueComparator<String>(propScoreList);
    TreeMap<String, String> mapTriee = new TreeMap<String, String>(comp);
    mapTriee.putAll(propScoreList);
    System.out.println(mapTriee);

    // One score per line: this is the input file of the DBSCAN implementation
    StringBuilder sb = new StringBuilder();
    for (String score : mapTriee.values()) {
        sb.append(Double.valueOf(score)).append("\n");
    }
    FileManager.create("nom", sb.toString().trim());

    int minPts = 1;
    double epsilon = 5d;
    AlgoDBSCAN algo = new AlgoDBSCAN();
    List<Cluster> clusters = algo.runAlgorithm(legato.getPath() + File.separator + "nom.txt", minPts, epsilon, "\n");
    algo.printStatistics();

    // Select the cluster of scores having the highest mean
    double highMean = 0;
    double[] heterCluster = null;
    for (Cluster cluster : clusters) {
        double[] arr = new double[cluster.getVectors().size()];
        int i = 0;
        for (DoubleArray dataPoint : cluster.getVectors()) {
            arr[i++] = dataPoint.data[0];
        }
        double mean = new A(arr).getMean();
        if (highMean < mean) {
            highMean = mean;
            heterCluster = arr;
        }
    }

    // Properties whose score belongs to the selected cluster are the ones to delete.
    // heterCluster stays null when no cluster has a positive mean: nothing is deleted then.
    List<String> propList = new ArrayList<String>();
    if (heterCluster != null) {
        for (Entry<String, String> entry : mapTriee.entrySet()) {
            for (int i = 0; i < heterCluster.length; i++) {
                if (String.valueOf(heterCluster[i]).equals(entry.getValue())) {
                    propList.add(entry.getKey());
                }
            }
        }
    }
    System.out.println(propList);

    // Rewrite with predecessors/successors included, then drop the problematic properties
    srcModel = ModelManager.rewrite(srcModel, true);
    System.out.println("source");
    tgtModel = ModelManager.rewrite(tgtModel, true);
    Model srcFinalModel = copyWithoutProperties(srcModel, propList);
    Model tgtFinalModel = copyWithoutProperties(tgtModel, propList);

    legato.setSource(FileManager.getCreatedRDFile("source", srcFinalModel));
    legato.setTarget(FileManager.getCreatedRDFile("target", tgtFinalModel));
    System.out.println("finish");
}

/**
 * Returns a new model holding every statement of {@code model} whose
 * predicate is not listed in {@code excluded}.
 */
private static Model copyWithoutProperties(Model model, List<String> excluded) {
    Model kept = ModelFactory.createDefaultModel();
    model.listStatements().toSet().forEach((stmt) -> {
        if (!excluded.contains(stmt.getPredicate().toString())) {
            kept.add(stmt);
        }
    });
    return kept;
}
use of legato.LEGATO in project legato by DOREMUS-ANR.
the class DocumentBuilder method getDocuments.
/**
 * Builds one textual document per resource, based on the selected properties.
 *
 * For every resource returned by {@code CBDBuilder.getResources} for the given
 * classes, the values of the selected properties are collected, cleaned with
 * {@code StopWords.clean}, and — when the result is not empty — stored both in
 * the returned map and on disk ({@code FileManager.create}). The generated
 * document name is also registered in LEGATO together with the resource URI,
 * on the source or target side depending on {@code dataset}.
 *
 * @param pathFile       path of the RDF dataset
 * @param classResources classes whose instances get a document
 * @param selectedProp   URIs of the properties whose values build the documents
 * @param dataset        "source" or "target"
 * @return a map from document name to document content
 * @throws Exception if the dataset cannot be loaded or a document cannot be written
 */
public static HashMap<String, String> getDocuments(String pathFile, List<String> classResources, List<String> selectedProp, String dataset) throws Exception {
    LEGATO legato = LEGATO.getInstance();

    /* Load RDF model from the dataset */
    Model modelSource = ModelManager.loadModel(pathFile);
    // 1st String = the docName. 2d String = its content
    HashMap<String, String> documents = new HashMap<String, String>();

    /* Documents creation based on the selected properties for each resource */
    for (Resource resource : CBDBuilder.getResources(modelSource, classResources)) {
        Model model = ModelFactory.createDefaultModel();
        String sparqlQueryString = "SELECT DISTINCT ?p ?o {<" + resource + "> ?p ?o }";
        Query query = QueryFactory.create(sparqlQueryString);
        QueryExecution qexec = QueryExecutionFactory.create(query, modelSource);
        try {
            ResultSet queryResults = qexec.execSelect();
            while (queryResults.hasNext()) {
                QuerySolution qs = queryResults.nextSolution();
                Resource prop = qs.getResource("?p");
                if (selectedProp.contains(prop.toString())) {
                    // NOTE(review): createResource is called with a Resource argument, not a URI
                    // string — in Jena this overload appears to create a fresh anonymous node
                    // typed by that resource. Presumably harmless since only the literals are
                    // read back below; confirm the intent.
                    model.createResource(resource).addProperty(model.createProperty(prop.toString()), qs.get("?o").toString());
                }
            }
        } finally {
            // Release the query even if reading the results fails
            qexec.close();
        }
        String docName = generateUUID(resource.getURI());

        /* Preprocessing before documents creation */
        String docContent = StopWords.clean(CBDBuilder.getLiterals(model));
        // Skip resources without usable content. The null test must come first:
        // calling equals(null) on the content can never detect a null content.
        if (docContent != null && !docContent.equals("") && !docContent.equals("\n") && !docContent.equals(" ")) {
            if (dataset.equals("source")) {
                legato.setSrcUri(docName, resource.getURI());
            } else if (dataset.equals("target")) {
                legato.setTgtUri(docName, resource.getURI());
            }
            // Construct a document for each resource
            documents.put(docName, docContent);
            FileManager.create(docName, docContent, dataset);
        }
    }
    return documents;
}
use of legato.LEGATO in project legato by DOREMUS-ANR.
the class FileManager method createRDFile.
/**
 * Creates an RDF file named {@code fileName.ext} in the given directory.
 *
 * The model is serialized as N-TRIPLES when {@code ext} is "nt" and as
 * Turtle ("TTL") for any other extension. The model is written through an
 * output stream, so the character encoding is chosen by the RDF writer
 * instead of depending on the platform default charset.
 *
 * @param dirCluster directory receiving the file
 * @param fileName   file name without extension
 * @param model      model to serialize
 * @param ext        file extension; "nt" selects N-TRIPLES, anything else Turtle
 * @throws IOException if the file cannot be created, written or closed
 */
public static void createRDFile(File dirCluster, String fileName, Model model, String ext) throws IOException {
    String path = dirCluster.getAbsolutePath() + File.separator + fileName + "." + ext;
    String lang = ext.equals("nt") ? "N-TRIPLES" : "TTL";
    // try-with-resources closes the stream in every case and lets a failing
    // close() (i.e. a failed flush) reach the caller instead of being swallowed.
    try (java.io.OutputStream out = new java.io.BufferedOutputStream(new java.io.FileOutputStream(path))) {
        model.write(out, lang);
    }
}
Aggregations