Search in sources :

Example 1 with LEGATO

use of legato.LEGATO in project legato by DOREMUS-ANR.

the class Matchifier method match.

/**
 * Links "Source" and "Target" instances and collects the resulting mappings.
 *
 * Stages visible in this listing:
 * 1. Get the document vectors and split them into source / target maps
 *    according to each vector's parentFolder.
 * 2. Cluster both maps, then for every (source cluster, target cluster) pair
 *    whose centroids are similar enough, build one RDF model per cluster,
 *    ask KeysClassifier for the best key, run SILK with it and store the
 *    parsed links in mapList2.
 * 3. For every source document, find the most similar target document and
 *    store the pair in mapList1 when it reaches the configured threshold.
 * 4. "Link repairing" and alignment-file creation, whose bodies are not
 *    visible in this listing.
 *
 * NOTE(review): this listing appears to have lost closing braces, block-comment
 * delimiters and some statements. Lines that start with "*" look like the
 * remains of block comments.
 *
 * @throws Exception declared by the method; the visible code does not show
 *         which calls raise it
 */
public void match() throws Exception {
    LEGATO legato = LEGATO.getInstance();
    // Final links
    MapList mapList = new MapList();
    // Recall++
    MapList mapList1 = new MapList();
    // Precision++
    MapList mapList2 = new MapList();
     * Getting vectors from "Source" and "Target" datasets
    VectorGenerator vectorGenerator = new VectorGenerator();
    // List of "Source" and "Target" TF-IDF vectors
    DocVector[] docVectors = vectorGenerator.GetDocumentVectors();
    // List of "Source" TF-IDF vectors with their "docName"
    HashMap<String, double[]> srcMap = new HashMap<String, double[]>();
    // List of "Target" TF-IDF vectors with their "docName"
    HashMap<String, double[]> tgtMap = new HashMap<String, double[]>();
    // Vectors whose parentFolder is neither "source" nor "target" are ignored.
    for (// Identify "Source" and "Target" vectors
    DocVector doc : // Identify "Source" and "Target" vectors
    docVectors) {
        double[] vector = doc.getVector();
        if (doc.parentFolder.equals("source"))
            srcMap.put(doc.docName, vector);
        else if (doc.parentFolder.equals("target"))
            tgtMap.put(doc.docName, vector);
     * "Hierarchical Clustering" on "Source" and "Target" datasets.
     * For each pair --> apply RANkey and link instances based on the
     * best key.
    // List of "Source" clusters
    ClusterList srcClS = Clustering.getClusters(srcMap);
    // List of "target" clusters
    ClusterList tgtClS = Clustering.getClusters(tgtMap);
     * RANKey
    // Counter used to name one sub-folder per processed cluster pair.
    int pairNumber = 0;
    // All pairs of clusters will be contained in folder "clusters"
    File dir = new File(legato.getPath() + File.separator + "clusters");
    // NOTE(review): distanceFunction is not referenced again in the visible code.
    DistanceFunction distanceFunction = new DistanceCorrelation();
    for (// For each cluster from "Source" dataset
    Cluster clust1 : // For each cluster from "Source" dataset
    srcClS) {
        for (// For each cluster from "Target" dataset
        Cluster clust2 : // For each cluster from "Target" dataset
        tgtClS) {
            // NOTE(review): cs and ct are not referenced again in the visible code.
            DoubleArray cs = new DoubleArray(clust1.getCentroid().elements);
            DoubleArray ct = new DoubleArray(clust2.getCentroid().elements);
            // Only cluster pairs whose centroid cosine similarity exceeds 0.4 are processed.
            if (CosineSimilarity.cosineSimilarity(clust1.getCentroid(), clust2.getCentroid()) > 0.4) {
                if (// If both clusters contain more than one instance
                clust1.size() > 1 && clust2.size() > 1) {
                    pairNumber = pairNumber + 1;
                    // Both clusters will be contained in folder "pairNumber"
                    File dirClusters = new File(dir.getAbsolutePath() + File.separator + pairNumber);
                     * Retrieve RDF Model from the 2 clusters
                    // NOTE(review): the source model is loaded again for every qualifying
                    // cluster pair; it could presumably be loaded once before the loops —
                    // confirm that loadModel has no per-call state first.
                    Model srcModel = ModelManager.loadModel(legato.src.toString());
                    Model model1 = ModelFactory.createDefaultModel();
                    // The cluster IDs come back as one newline-separated string; each ID is
                    // looked up in the source URI map (presumably docName -> URI).
                    String[] resources = clust1.getIDs().split("\n");
                    for (String rsrce : resources) {
                        String uri = legato.getSrcURIs().get(rsrce);
                        Resource resource = ResourceFactory.createResource(uri);
                        model1.add(CBDBuilder.getCBD(srcModel, resource));
                    // Same construction for the target cluster (same reload remark applies).
                    Model tgtModel = ModelManager.loadModel(legato.tgt.toString());
                    Model model2 = ModelFactory.createDefaultModel();
                    resources = clust2.getIDs().split("\n");
                    for (String rsrce : resources) {
                        String uri = legato.getTgtURIs().get(rsrce);
                        Resource resource = ResourceFactory.createResource(uri);
                        model2.add(CBDBuilder.getCBD(tgtModel, resource));
                     * Execute RANKey
                    HashSet<String> bestKey = KeysClassifier.getBestKey(model1, model2, dirClusters);
                    // A null key means there is nothing to link for this pair.
                    if (!(bestKey == null)) {
                         * Execute SILK
                        SilkConfig.config(bestKey, dirClusters, dirClusters.toString() + File.separator + "source.nt", dirClusters.toString() + File.separator + "target.nt");
                        // The links are read back from "links.rdf" in the pair folder.
                        File file = new File(dirClusters.toString() + File.separator + "links.rdf");
                        AlignmentParser aparser = new AlignmentParser(0);
                        Alignment links = aparser.parse(file.toURI());
                        for (Cell cell : links) {
                            mapList2.add(cell.getObject1AsURI().toString(), cell.getObject2AsURI().toString(), cell.getStrength());
                // Single-instance pair: the only statement in this branch is commented out.
                } else if (clust1.size() == 1 && clust2.size() == 1) {
                // mapList2.add(""+clust1.getExemplar().getID(), ""+clust2.getExemplar().getID(), CosineSimilarity.cosineSimilarity(clust1.getCentroid(), clust2.getCentroid()));
     * Comparison
    // Both loops run over all vectors; only (source, target) combinations pass the
    // parentFolder test below.
    for (int i = 0; i < docVectors.length; i++) {
        DocVector srcDoc = docVectors[i];
        String tgtDoc = null;
        double simVal = 0;
        for (int j = 0; j < docVectors.length; j++) {
            if ((srcDoc.parentFolder.equals("source")) && (docVectors[j].parentFolder.equals("target"))) {
                // Keep the first target seen, then any target with a strictly higher similarity.
                // NOTE(review): the similarity is computed twice for each accepted candidate.
                if ((tgtDoc == null) || (CosineSimilarity.cosineSimilarity(srcDoc, docVectors[j]) > simVal)) {
                    tgtDoc = docVectors[j].docName;
                    simVal = CosineSimilarity.cosineSimilarity(srcDoc, docVectors[j]);
        // The best candidate is kept only when it reaches the configured threshold.
        if ((tgtDoc != null) && simVal >= legato.getThreshold()) {
            /*	Model srcModel = ModelManager.loadModel(legato.src.toString());
	    		Model model1 = ModelFactory.createDefaultModel();
	    		Resource rsrce1 = model1.createResource(legato.getSrcURIs().get(srcDoc.docName));
	    		String str1 = legato.getType(rsrce1, srcModel).toString();
	    		Model tgtModel = ModelManager.loadModel(legato.tgt.toString());
	    		Model model2 = ModelFactory.createDefaultModel();
	    		Resource rsrce2 = model2.createResource(legato.getTgtURIs().get(tgtDoc));
	    		String str2 = legato.getType(rsrce2, tgtModel).toString();
	    		if (str1.equals(str2)) */
            mapList1.add(legato.getSrcURIs().get(srcDoc.docName), legato.getTgtURIs().get(tgtDoc), simVal);
     * Link repairing
    // First pass: flags mapList1 entries whose (source, target) pair also occurs in mapList2.
    // NOTE(review): the statement guarded by "exist == false" is missing from this listing.
    for (Map map1 : mapList1) {
        boolean exist = false;
        for (Map map2 : mapList2) {
            if (map1.getSourceURI().equals(map2.getSourceURI())) {
                if (map1.getTargetURI().equals(map2.getTargetURI()))
                exist = true;
        if (exist == false)
    // Second pass: flags mapList2 entries whose source URI also occurs in mapList1
    // (only the source URI is compared here).
    // NOTE(review): the body of the final "if" is missing from this listing.
    for (Map map2 : mapList2) {
        boolean exist = false;
        for (Map map1 : mapList1) {
            if (map2.getSourceURI().equals(map1.getSourceURI())) {
                exist = true;
        if (exist == false) {
     ** Create and save the alignment file
    // NOTE(review): none of the File objects below is used in the visible remainder
    // of the method; the statements that used them are presumably missing.
    File dirr = new File(legato.getPath() + File.separator + "docs");
    File dirind = new File(legato.getPath() + File.separator + "index");
    File srcFile = new File(legato.getPath() + File.separator + "source.rdf");
    File tgtFile = new File(legato.getPath() + File.separator + "target.rdf");
    File txtFile = new File(legato.getPath() + File.separator + "nom.txt");
Also used : ClusterList(legato.cluster.ClusterList) HashMap(java.util.HashMap) Alignment(org.semanticweb.owl.align.Alignment) LEGATO(legato.LEGATO) Cell(org.semanticweb.owl.align.Cell) Resource(org.apache.jena.rdf.model.Resource) Cluster(legato.cluster.Cluster) DocVector(legato.indexer.DocVector) AlignmentParser(fr.inrialpes.exmo.align.parser.AlignmentParser) DistanceFunction(ca.pfv.spmf.algorithms.clustering.distanceFunctions.DistanceFunction) Model(org.apache.jena.rdf.model.Model) VectorGenerator(legato.indexer.VectorGenerator) File(java.io.File) DistanceCorrelation(ca.pfv.spmf.algorithms.clustering.distanceFunctions.DistanceCorrelation) DoubleArray(ca.pfv.spmf.patterns.cluster.DoubleArray) HashMap(java.util.HashMap)

Example 2 with LEGATO

use of legato.LEGATO in project legato by DOREMUS-ANR.

the class ModelManager method rewrite.

 ** Place all literals (in each resource's CBD) at a distance of 1 from the resource
/**
 * Builds a new model in which every literal found for a typed resource is
 * attached directly to that resource, under a property whose name stands for
 * the chain of properties that led to the literal.
 *
 * NOTE(review): this listing appears to have lost closing braces and the
 * bodies of the try/catch blocks.
 *
 * @param model the model to rewrite; it is read but not modified in the visible code
 * @param ok    when true, the CBD of each resource is extended with its direct
 *              predecessors and successors before literals are collected
 * @return the newly created model holding the flattened statements
 * @throws IOException declared by the method; the visible code only shows
 *         IOException being caught inside the lambdas
 */
public static Model rewrite(Model model, boolean ok) throws IOException {
    LEGATO legato = LEGATO.getInstance();
    Model finalModel = ModelFactory.createDefaultModel();
    model.listSubjects().toSet().forEach((resource) -> {
        // Parse all resources
        if (// If the current resource belongs to a given "type"
        legato.hasType(resource) == true) {
            Model m = CBDBuilder.getCBD(model, resource);
            if (ok == true) {
                m.add(CBDBuilder.getCBDDirectPredecessors(model, resource));
                m.add(CBDBuilder.getCBDDirectSuccessors(model, resource));
            // NOTE(review): the try body and the catch body are empty in this listing.
            try {
            } catch (IOException e1) {
            m.listStatements().toSet().forEach((stmt) -> {
                Resource sub = stmt.getSubject();
                Property prop = stmt.getPredicate();
                RDFNode object = stmt.getObject();
                if (// Parse all literals
                object.isLiteral() == true) {
                    // A filter which accepts statements whose predicate matches one of a collection of predicates held by the filter object.
                    Path path = OntTools.findShortestPath(m, resource, object, Filter.any);
                    if (!(path == null)) {
                        // Get the successive properties from the path
                        List<Property> properties = getPropFromPath(path);
                        // An unknown property chain is registered under a new name, which is
                        // the current size of the property list; a known chain reuses the
                        // name already stored for it.
                        if (legato.getPropList().existProperty(properties) == false) {
                            int indice = legato.getPropList().size();
                            finalModel.createResource(resource.toString()).addProperty(finalModel.createProperty("" + indice), object);
                            try {
                                legato.addToPropList("" + indice, properties);
                            // NOTE(review): the catch body is not visible in this listing.
                            } catch (IOException e) {
                        } else {
                            finalModel.createResource(resource.toString()).addProperty(finalModel.createProperty(legato.getPropList().getPropertyName(properties)), object);
                    } else {
                        // No path from the resource to the literal inside m: look up the
                        // resources that point to this resource in the full model and
                        // search for a path from each of them instead.
                        String sparqlQueryString = "select ?predec where {" + "?predec ?prop <" + resource + ">." + "}";
                        Query query = QueryFactory.create(sparqlQueryString);
                        // NOTE(review): qexec is never closed in the visible code.
                        QueryExecution qexec = QueryExecutionFactory.create(query, model);
                        ResultSet queryResults = qexec.execSelect();
                        while (queryResults.hasNext()) {
                            QuerySolution qs = queryResults.nextSolution();
                            final PathManager.Path path2 = PathManager.findShortestPath(model, qs.getResource("?predec"), object, prop);
                            if (!(path2 == null)) {
                                // Get the successive properties from the path
                                List<Property> properties = getPropFromPath(path2);
                                // Same naming scheme as in the direct-path branch above.
                                if (legato.getPropList().existProperty(properties) == false) {
                                    int indice = legato.getPropList().size();
                                    finalModel.createResource(resource.toString()).addProperty(finalModel.createProperty("" + indice), object);
                                    try {
                                        legato.addToPropList("" + indice, properties);
                                    // NOTE(review): the catch body is not visible in this listing.
                                    } catch (IOException e) {
                                } else {
                                    finalModel.createResource(resource.toString()).addProperty(finalModel.createProperty(legato.getPropList().getPropertyName(properties)), object);
                // rdf:type statements are copied only when their subject has an accepted type.
                } else if (prop.equals(RDF.type) && (legato.hasType(sub))) {
                    finalModel.createResource(resource.toString()).addProperty(RDF.type, object);
            // else
            // finalModel.createResource(resource.toString()).addProperty(prop, object);
    return finalModel;
Also used : Path(org.apache.jena.ontology.OntTools.Path) Query(org.apache.jena.query.Query) Resource(org.apache.jena.rdf.model.Resource) IOException(java.io.IOException) QueryExecution(org.apache.jena.query.QueryExecution) LEGATO(legato.LEGATO) QuerySolution(org.apache.jena.query.QuerySolution) OntModel(org.apache.jena.ontology.OntModel) Model(org.apache.jena.rdf.model.Model) ResultSet(org.apache.jena.query.ResultSet) Property(org.apache.jena.rdf.model.Property) RDFNode(org.apache.jena.rdf.model.RDFNode)

Example 3 with LEGATO

use of legato.LEGATO in project legato by DOREMUS-ANR.

the class PropertyHandler method clean.

 * This method deletes problematic properties
/**
 * Removes a set of properties from the source and target models and registers
 * the filtered models on the LEGATO instance.
 *
 * Visible steps: rewrite both models, score the distinct properties, write the
 * scores to the "nom" file, cluster them with DBSCAN, keep the cluster with the
 * highest mean, rewrite both models again with ok = true, then filter their
 * statements against propList.
 *
 * NOTE(review): this listing appears to have lost closing braces and several
 * expressions. Truncated lines are flagged below.
 *
 * @param srcPath path of the source dataset, passed to ModelManager.loadModel
 * @param tgtPath path of the target dataset, passed to ModelManager.loadModel
 * @throws IOException declared by the method; the visible code does not show
 *         which calls raise it
 */
public static void clean(String srcPath, String tgtPath) throws IOException {
    LEGATO legato = LEGATO.getInstance();
    Model srcModel = ModelManager.loadModel(srcPath);
    Model tgtModel = ModelManager.loadModel(tgtPath);
    // NOTE(review): the two default models created here are replaced immediately
    // by the rewrite results below.
    Model s = ModelFactory.createDefaultModel();
    Model t = ModelFactory.createDefaultModel();
    s = ModelManager.rewrite(srcModel, false);
    t = ModelManager.rewrite(tgtModel, false);
    // NOTE(review): nothing is added to mergedModel in the visible code before it
    // is used; the statements merging s and t into it are presumably missing.
    Model mergedModel = ModelFactory.createDefaultModel();
    List<Resource> properties = getDistinctProperties(mergedModel);
    // Property -> score, with the score stored as a string.
    HashMap<String, String> propScoreList = new HashMap<String, String>();
    properties.forEach((property) -> {
        propScoreList.put(property.toString(), String.valueOf(getScore(property, mergedModel)));
    ValueComparator<String> comp = new ValueComparator<String>(propScoreList);
    // NOTE(review): no entries are put into mapTriee in the visible code.
    TreeMap<String, String> mapTriee = new TreeMap<String, String>(comp);
    // One score per line, in the iteration order of mapTriee.
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < mapTriee.entrySet().size(); i++) {
        // NOTE(review): values().toArray() is rebuilt on every iteration.
        sb.append(Double.valueOf((String) mapTriee.values().toArray()[i]) + "\n");
    FileManager.create("nom", sb.toString().trim());
    // DBSCAN parameters passed to runAlgorithm below.
    int minPts = 1;
    double epsilon = 5d;
    AlgoDBSCAN algo = new AlgoDBSCAN();
    List<Cluster> clusters = algo.runAlgorithm(legato.getPath() + File.separator + "nom.txt", minPts, epsilon, "\n");
    // Keep the values of the cluster whose mean is the highest.
    double highMean = 0;
    double[] heterCluster = null;
    for (Cluster cluster : clusters) {
        double[] arr = new double[cluster.getVectors().size()];
        int i = 0;
        for (DoubleArray dataPoint : cluster.getVectors()) {
            // NOTE(review): the right-hand side is truncated in this listing.
            arr[i++] =[0];
        A a = new A(arr);
        if (highMean < a.getMean()) {
            highMean = a.getMean();
            heterCluster = arr;
    List<String> propList = new ArrayList<String>();
    Iterator it = mapTriee.entrySet().iterator();
    while (it.hasNext()) {
        // NOTE(review): the cast operand is truncated in this listing.
        Entry<String, String> entry = (Entry<String, String>);
        boolean f = false;
        // NOTE(review): heterCluster is still null here when no cluster has a mean
        // above 0 (or there are no clusters), so this loop would then throw a
        // NullPointerException.
        for (int i = 0; i < heterCluster.length; i++) {
            // NOTE(review): the statement guarded by this test is missing from the listing.
            if (String.valueOf(heterCluster[i]).equals(entry.getValue()))
    // Second rewrite, this time with ok = true.
    srcModel = ModelManager.rewrite(srcModel, true);
    tgtModel = ModelManager.rewrite(tgtModel, true);
    Model srcFinalModel = ModelFactory.createDefaultModel();
    srcModel.listStatements().toSet().forEach((stmt) -> {
        Property property = stmt.getPredicate();
        // Selects statements whose predicate is not in propList; the guarded
        // statement is missing from this listing.
        if (!(propList.contains(property.toString()))) {
    Model tgtFinalModel = ModelFactory.createDefaultModel();
    tgtModel.listStatements().toSet().forEach((stmt) -> {
        Property property = stmt.getPredicate();
        // Same test for the target model; the guarded statement is missing as well.
        if (!propList.contains(property.toString())) {
    // FileManager.createRDFile(new File(legato.getPath()+"store"), "source", srcFinalModel, "TTL");
    // FileManager.createRDFile(new File(legato.getPath()+"store"), "target", tgtFinalModel, "TTL");
    legato.setSource(FileManager.getCreatedRDFile("source", srcFinalModel));
    legato.setTarget(FileManager.getCreatedRDFile("target", tgtFinalModel));
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Entry(java.util.Map.Entry) LEGATO(legato.LEGATO) Iterator(java.util.Iterator) Property(org.apache.jena.rdf.model.Property) Resource(org.apache.jena.rdf.model.Resource) Cluster(ca.pfv.spmf.patterns.cluster.Cluster) TreeMap(java.util.TreeMap) AlgoDBSCAN(ca.pfv.spmf.algorithms.clustering.dbscan.AlgoDBSCAN) Model(org.apache.jena.rdf.model.Model) DoubleArray(ca.pfv.spmf.patterns.cluster.DoubleArray)

Example 4 with LEGATO

use of legato.LEGATO in project legato by DOREMUS-ANR.

the class DocumentBuilder method getDocuments.

 * Build documents for resources based on selected properties
public static HashMap<String, String> getDocuments(String pathFile, List<String> classResources, List<String> selectedProp, String dataset) throws Exception {
    LEGATO legato = LEGATO.getInstance();
     * Load RDF model from the dataset
    File f = new File(pathFile);
    Model modelSource = ModelManager.loadModel(pathFile);
    // 1st String = the docName. 2d String = its content
    HashMap<String, String> documents = new HashMap<String, String>();
     * Documents creation based on the selected properties for each resource
    for (Resource resource : CBDBuilder.getResources(modelSource, classResources)) {
        Model model = ModelFactory.createDefaultModel();
        String sparqlQueryString = "SELECT DISTINCT ?p ?o {<" + resource + "> ?p ?o }";
        Query query = QueryFactory.create(sparqlQueryString);
        QueryExecution qexec = QueryExecutionFactory.create(query, modelSource);
        ResultSet queryResults = qexec.execSelect();
        while (queryResults.hasNext()) {
            QuerySolution qs = queryResults.nextSolution();
            Resource prop = qs.getResource("?p");
            if (selectedProp.contains(prop.toString())) {
                model.createResource(resource).addProperty(model.createProperty(prop.toString()), qs.get("?o").toString());
        String docName = generateUUID(resource.getURI());
         * Preprocessing before documents creation
        String docContent = StopWords.clean(CBDBuilder.getLiterals(model));
        // docContent = Stemmer.stem(docContent);
        if (!docContent.equals("") && !docContent.equals(null) && !docContent.equals("\n") && !docContent.equals(" ")) {
            if (dataset.equals("source"))
                legato.setSrcUri(docName, resource.getURI());
            else if (dataset.equals("target"))
                legato.setTgtUri(docName, resource.getURI());
            // Construct a document for each resource
            documents.put(docName, docContent);
            FileManager.create(docName, docContent, dataset);
    return documents;
Also used : LEGATO(legato.LEGATO) Query(org.apache.jena.query.Query) HashMap(java.util.HashMap) QuerySolution(org.apache.jena.query.QuerySolution) Model(org.apache.jena.rdf.model.Model) Resource(org.apache.jena.rdf.model.Resource) ResultSet(org.apache.jena.query.ResultSet) File(java.io.File) QueryExecution(org.apache.jena.query.QueryExecution)

Example 5 with LEGATO

use of legato.LEGATO in project legato by DOREMUS-ANR.

the class FileManager method createRDFile.

 * Create an RDF file
public static void createRDFile(File dirCluster, String fileName, Model model, String ext) throws IOException {
    LEGATO legato = LEGATO.getInstance();
    FileWriter out = new FileWriter(dirCluster.getAbsolutePath() + File.separator + fileName + "." + ext);
    try {
        if (ext.equals("nt"))
            model.write(out, "N-TRIPLES");
            model.write(out, "TTL");
    } finally {
        try {
        } catch (IOException closeException) {
Also used : LEGATO(legato.LEGATO) FileWriter(java.io.FileWriter) IOException(java.io.IOException)


LEGATO (legato.LEGATO)14 File ( Resource (org.apache.jena.rdf.model.Resource)7 Model (org.apache.jena.rdf.model.Model)6 IOException ( ArrayList (java.util.ArrayList)4 HashMap (java.util.HashMap)4 QueryExecution (org.apache.jena.query.QueryExecution)4 FileWriter ( Iterator (java.util.Iterator)3 Query (org.apache.jena.query.Query)3 QuerySolution (org.apache.jena.query.QuerySolution)3 ResultSet (org.apache.jena.query.ResultSet)3 Property (org.apache.jena.rdf.model.Property)3 DoubleArray (ca.pfv.spmf.patterns.cluster.DoubleArray)2 AlignmentParser (fr.inrialpes.exmo.align.parser.AlignmentParser)2 BufferedReader ( InputStreamReader ( TreeMap (java.util.TreeMap)2 Key (legato.keys.def.Key)2