Search in sources :

Example 1 with DuplicateDataConfig

use of org.apache.rya.indexing.smarturi.duplication.conf.DuplicateDataConfig in project incubator-rya by apache.

The method testCreateEntityNearDuplicateConfigDisabled of the class DuplicateDataDetectorIT.

/**
 * Verifies that entities are stored even when they closely resemble an
 * existing entity, provided the {@link DuplicateDataConfig} handed to the
 * {@link DuplicateDataDetector} has detection turned off.
 * <p>
 * The test stores a baseline "Bob" entity, then stores one copy whose
 * properties differ only slightly and one copy whose properties differ
 * substantially. Neither store is expected to raise an
 * {@link EntityNearDuplicateException}.
 */
@Test
public void testCreateEntityNearDuplicateConfigDisabled() throws EntityStorageException, TypeStorageException, ConfigurationException, ObjectStorageException {
    // Create the types the Entity uses.
    final TypeStorage typeStorage = new MongoTypeStorage(super.getMongoClient(), RYA_INSTANCE_NAME);
    final Type personType = createPersonType();
    final Type employeeType = createEmployeeType();
    typeStorage.create(personType);
    typeStorage.create(employeeType);
    final Optional<Type> storedPersonType = typeStorage.get(personType.getId());
    final Optional<Type> storedEmployeeType = typeStorage.get(employeeType.getId());
    assertTrue(storedPersonType.isPresent());
    assertTrue(storedEmployeeType.isPresent());

    // Build the duplicate detection configuration: one tolerance per data type.
    final DuplicateDataConfig duplicateDataConfig = new DuplicateDataConfig(
        // boolean
        new Tolerance(0.0, ToleranceType.DIFFERENCE),
        // byte
        new Tolerance(0.0, ToleranceType.DIFFERENCE),
        // date
        new Tolerance(500.0, ToleranceType.DIFFERENCE),
        // double
        new Tolerance(0.0001, ToleranceType.PERCENTAGE),
        // float
        new Tolerance(0.0001, ToleranceType.PERCENTAGE),
        // integer
        new Tolerance(1.0, ToleranceType.DIFFERENCE),
        // long
        new Tolerance(1.0, ToleranceType.DIFFERENCE),
        // short
        new Tolerance(1.0, ToleranceType.DIFFERENCE),
        // string
        new Tolerance(1.0, ToleranceType.DIFFERENCE),
        // uri
        new Tolerance(1.0, ToleranceType.DIFFERENCE),
        // presumably the equivalent terms map (empty here) -- confirm against the constructor
        new HashMap<String, List<String>>(),
        // detection flag: disabled for this test
        false);
    final DuplicateDataDetector duplicateDataDetector = new DuplicateDataDetector(duplicateDataConfig);
    final EntityStorage entityStorage = new MongoEntityStorage(super.getMongoClient(), RYA_INSTANCE_NAME, duplicateDataDetector);

    // Store the baseline entity.
    final Entity bobEntity = createBobEntity();
    entityStorage.create(bobEntity);
    assertTrue(entityStorage.get(bobEntity.getSubject()).isPresent());

    final Builder duplicateBobBuilder = Entity.builder(createBobEntity());
    duplicateBobBuilder.setSubject(createRyaUri("Robert"));
    // Modify a property for each type so that it stays within tolerance.
    duplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_AGE, shortRyaType((short) 41)));
    duplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_WEIGHT, floatRyaType(250.76f)));
    duplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_HEIGHT, doubleRyaType(72.499)));
    duplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_INCOME, intRyaType(50001)));
    duplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_NUMBER_OF_CHILDREN, byteRyaType((byte) 2)));
    duplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_LICENSE_NUMBER, longRyaType(123456789013L)));
    duplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_DATE_OF_BIRTH, dateRyaType(new DateTime(NOW.getTime() - 1).minusYears(40))));
    duplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_EXPIRATION_DATE, dateRyaType(new Date(NOW.getTime() - 1))));
    duplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_GLASSES, booleanRyaType(true)));
    duplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_EMAIL_ADDRESS, uriRyaType(new URIImpl("mailto:bob.smitch01@gmail.com"))));
    duplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_ADDRESS, stringRyaType("124 Fake St. Washington, DC 20024")));
    duplicateBobBuilder.setProperty(EMPLOYEE_TYPE_URI, new Property(HAS_EXTENSION, shortRyaType((short) 556)));
    final Entity duplicateBobEntity = duplicateBobBuilder.build();
    // Data duplication detection is disabled so it will be created.
    try {
        entityStorage.create(duplicateBobEntity);
    } catch (final EntityNearDuplicateException e) {
        // Report the cause instead of failing silently.
        fail("Near-duplicate entity was rejected although detection is disabled: " + e.getMessage());
    }
    assertTrue(entityStorage.get(duplicateBobEntity.getSubject()).isPresent());

    final Builder notDuplicateBobBuilder = Entity.builder(createBobEntity());
    notDuplicateBobBuilder.setSubject(createRyaUri("Not Bob"));
    // Modify a property for each type so that it falls outside tolerance.
    notDuplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_AGE, shortRyaType((short) 50)));
    notDuplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_WEIGHT, floatRyaType(300.0f)));
    notDuplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_HEIGHT, doubleRyaType(100.0)));
    notDuplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_INCOME, intRyaType(60000)));
    notDuplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_NUMBER_OF_CHILDREN, byteRyaType((byte) 5)));
    notDuplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_LICENSE_NUMBER, longRyaType(9L)));
    notDuplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_DATE_OF_BIRTH, dateRyaType(new DateTime(NOW.getTime() - 10000000L).minusYears(40))));
    notDuplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_EXPIRATION_DATE, dateRyaType(new Date(NOW.getTime() - 10000000L))));
    notDuplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_GLASSES, booleanRyaType(false)));
    notDuplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_EMAIL_ADDRESS, uriRyaType(new URIImpl("mailto:bad.email.address@gmail.com"))));
    notDuplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_ADDRESS, stringRyaType("123456789 Fake St. Washington, DC 20024")));
    notDuplicateBobBuilder.setProperty(EMPLOYEE_TYPE_URI, new Property(HAS_EXTENSION, shortRyaType((short) 1000)));
    final Entity notDuplicateBobEntity = notDuplicateBobBuilder.build();
    // Data duplication detection is disabled so it will be created.
    try {
        entityStorage.create(notDuplicateBobEntity);
    } catch (final EntityNearDuplicateException e) {
        // Report the cause instead of failing silently.
        fail("Distinct entity was rejected as a near duplicate although detection is disabled: " + e.getMessage());
    }
    assertTrue(entityStorage.get(notDuplicateBobEntity.getSubject()).isPresent());
}
Also used : Entity(org.apache.rya.indexing.entity.model.Entity) MongoEntityStorage(org.apache.rya.indexing.entity.storage.mongo.MongoEntityStorage) EntityStorage(org.apache.rya.indexing.entity.storage.EntityStorage) MongoEntityStorage(org.apache.rya.indexing.entity.storage.mongo.MongoEntityStorage) ReflectionToStringBuilder(org.apache.commons.lang.builder.ReflectionToStringBuilder) Builder(org.apache.rya.indexing.entity.model.Entity.Builder) URIImpl(org.openrdf.model.impl.URIImpl) DateTime(org.joda.time.DateTime) Date(java.util.Date) MongoTypeStorage(org.apache.rya.indexing.entity.storage.mongo.MongoTypeStorage) TypeStorage(org.apache.rya.indexing.entity.storage.TypeStorage) MongoTypeStorage(org.apache.rya.indexing.entity.storage.mongo.MongoTypeStorage) RyaType(org.apache.rya.api.domain.RyaType) RyaTypeUtils.shortRyaType(org.apache.rya.api.domain.RyaTypeUtils.shortRyaType) RyaTypeUtils.floatRyaType(org.apache.rya.api.domain.RyaTypeUtils.floatRyaType) RyaTypeUtils.uriRyaType(org.apache.rya.api.domain.RyaTypeUtils.uriRyaType) RyaTypeUtils.longRyaType(org.apache.rya.api.domain.RyaTypeUtils.longRyaType) RyaTypeUtils.stringRyaType(org.apache.rya.api.domain.RyaTypeUtils.stringRyaType) RyaTypeUtils.doubleRyaType(org.apache.rya.api.domain.RyaTypeUtils.doubleRyaType) RyaTypeUtils.byteRyaType(org.apache.rya.api.domain.RyaTypeUtils.byteRyaType) RyaTypeUtils.booleanRyaType(org.apache.rya.api.domain.RyaTypeUtils.booleanRyaType) RyaTypeUtils.dateRyaType(org.apache.rya.api.domain.RyaTypeUtils.dateRyaType) Type(org.apache.rya.indexing.entity.model.Type) RyaTypeUtils.intRyaType(org.apache.rya.api.domain.RyaTypeUtils.intRyaType) DuplicateDataConfig(org.apache.rya.indexing.smarturi.duplication.conf.DuplicateDataConfig) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) Property(org.apache.rya.indexing.entity.model.Property) Test(org.junit.Test)

Example 2 with DuplicateDataConfig

use of org.apache.rya.indexing.smarturi.duplication.conf.DuplicateDataConfig in project incubator-rya by apache.

The method testCreateEntityNearDuplicate of the class DuplicateDataDetectorIT.

/**
 * Verifies that, with detection turned on in the
 * {@link DuplicateDataConfig}, storing an entity whose properties are all
 * within tolerance of an existing entity raises an
 * {@link EntityNearDuplicateException} and leaves it unstored, while an
 * entity whose properties are well outside tolerance is stored normally.
 */
@Test
public void testCreateEntityNearDuplicate() throws EntityStorageException, TypeStorageException, ObjectStorageException {
    // Create the types the Entity uses.
    final TypeStorage typeStorage = new MongoTypeStorage(super.getMongoClient(), RYA_INSTANCE_NAME);
    final Type personType = createPersonType();
    final Type employeeType = createEmployeeType();
    typeStorage.create(personType);
    typeStorage.create(employeeType);
    final Optional<Type> storedPersonType = typeStorage.get(personType.getId());
    final Optional<Type> storedEmployeeType = typeStorage.get(employeeType.getId());
    assertTrue(storedPersonType.isPresent());
    assertTrue(storedEmployeeType.isPresent());

    // Build the duplicate detection configuration: one tolerance per data type.
    final DuplicateDataConfig duplicateDataConfig = new DuplicateDataConfig(
        // boolean
        new Tolerance(0.0, ToleranceType.DIFFERENCE),
        // byte
        new Tolerance(0.0, ToleranceType.DIFFERENCE),
        // date
        new Tolerance(500.0, ToleranceType.DIFFERENCE),
        // double
        new Tolerance(0.0001, ToleranceType.PERCENTAGE),
        // float
        new Tolerance(0.0001, ToleranceType.PERCENTAGE),
        // integer
        new Tolerance(1.0, ToleranceType.DIFFERENCE),
        // long
        new Tolerance(1.0, ToleranceType.DIFFERENCE),
        // short
        new Tolerance(1.0, ToleranceType.DIFFERENCE),
        // string
        new Tolerance(1.0, ToleranceType.DIFFERENCE),
        // uri
        new Tolerance(1.0, ToleranceType.DIFFERENCE),
        // presumably the equivalent terms map (empty here) -- confirm against the constructor
        new HashMap<String, List<String>>(),
        // detection flag: enabled for this test
        true);
    final DuplicateDataDetector duplicateDataDetector = new DuplicateDataDetector(duplicateDataConfig);
    final EntityStorage entityStorage = new MongoEntityStorage(super.getMongoClient(), RYA_INSTANCE_NAME, duplicateDataDetector);

    // Store the baseline entity.
    final Entity bobEntity = createBobEntity();
    entityStorage.create(bobEntity);
    assertTrue(entityStorage.get(bobEntity.getSubject()).isPresent());

    final Builder duplicateBobBuilder = Entity.builder(createBobEntity());
    duplicateBobBuilder.setSubject(createRyaUri("Robert"));
    // Modify a property for each type so that it stays within tolerance.
    duplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_AGE, shortRyaType((short) 41)));
    duplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_WEIGHT, floatRyaType(250.76f)));
    duplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_HEIGHT, doubleRyaType(72.499)));
    duplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_INCOME, intRyaType(50001)));
    duplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_NUMBER_OF_CHILDREN, byteRyaType((byte) 2)));
    duplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_LICENSE_NUMBER, longRyaType(123456789013L)));
    duplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_DATE_OF_BIRTH, dateRyaType(new DateTime(NOW.getTime() - 1).minusYears(40))));
    duplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_EXPIRATION_DATE, dateRyaType(new Date(NOW.getTime() - 1))));
    duplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_GLASSES, booleanRyaType(true)));
    duplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_EMAIL_ADDRESS, uriRyaType(new URIImpl("mailto:bob.smitch01@gmail.com"))));
    duplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_ADDRESS, stringRyaType("124 Fake St. Washington, DC 20024")));
    duplicateBobBuilder.setProperty(EMPLOYEE_TYPE_URI, new Property(HAS_EXTENSION, shortRyaType((short) 556)));
    final Entity duplicateBobEntity = duplicateBobBuilder.build();
    // Try to create another entity that's considered a duplicate.
    // It will NOT be created.
    boolean hasDuplicate = false;
    try {
        entityStorage.create(duplicateBobEntity);
    } catch (final EntityNearDuplicateException e) {
        hasDuplicate = true;
    }
    assertTrue("Expected an EntityNearDuplicateException for the near-duplicate entity", hasDuplicate);
    assertFalse(entityStorage.get(duplicateBobEntity.getSubject()).isPresent());

    final Builder notDuplicateBobBuilder = Entity.builder(createBobEntity());
    notDuplicateBobBuilder.setSubject(createRyaUri("Not Bob"));
    // Modify a property for each type so that it falls outside tolerance.
    notDuplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_AGE, shortRyaType((short) 50)));
    notDuplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_WEIGHT, floatRyaType(300.0f)));
    notDuplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_HEIGHT, doubleRyaType(100.0)));
    notDuplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_INCOME, intRyaType(60000)));
    notDuplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_NUMBER_OF_CHILDREN, byteRyaType((byte) 5)));
    notDuplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_LICENSE_NUMBER, longRyaType(9L)));
    notDuplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_DATE_OF_BIRTH, dateRyaType(new DateTime(NOW.getTime() - 10000000L).minusYears(40))));
    notDuplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_EXPIRATION_DATE, dateRyaType(new Date(NOW.getTime() - 10000000L))));
    notDuplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_GLASSES, booleanRyaType(false)));
    notDuplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_EMAIL_ADDRESS, uriRyaType(new URIImpl("mailto:bad.email.address@gmail.com"))));
    notDuplicateBobBuilder.setProperty(PERSON_TYPE_URI, new Property(HAS_ADDRESS, stringRyaType("123456789 Fake St. Washington, DC 20024")));
    notDuplicateBobBuilder.setProperty(EMPLOYEE_TYPE_URI, new Property(HAS_EXTENSION, shortRyaType((short) 1000)));
    final Entity notDuplicateBobEntity = notDuplicateBobBuilder.build();
    // It will be created.
    try {
        entityStorage.create(notDuplicateBobEntity);
    } catch (final EntityNearDuplicateException e) {
        // Report the cause instead of failing silently.
        fail("Distinct entity was wrongly rejected as a near duplicate: " + e.getMessage());
    }
    assertTrue(entityStorage.get(notDuplicateBobEntity.getSubject()).isPresent());
}
Also used : Entity(org.apache.rya.indexing.entity.model.Entity) MongoEntityStorage(org.apache.rya.indexing.entity.storage.mongo.MongoEntityStorage) EntityStorage(org.apache.rya.indexing.entity.storage.EntityStorage) MongoEntityStorage(org.apache.rya.indexing.entity.storage.mongo.MongoEntityStorage) ReflectionToStringBuilder(org.apache.commons.lang.builder.ReflectionToStringBuilder) Builder(org.apache.rya.indexing.entity.model.Entity.Builder) URIImpl(org.openrdf.model.impl.URIImpl) DateTime(org.joda.time.DateTime) Date(java.util.Date) MongoTypeStorage(org.apache.rya.indexing.entity.storage.mongo.MongoTypeStorage) TypeStorage(org.apache.rya.indexing.entity.storage.TypeStorage) MongoTypeStorage(org.apache.rya.indexing.entity.storage.mongo.MongoTypeStorage) RyaType(org.apache.rya.api.domain.RyaType) RyaTypeUtils.shortRyaType(org.apache.rya.api.domain.RyaTypeUtils.shortRyaType) RyaTypeUtils.floatRyaType(org.apache.rya.api.domain.RyaTypeUtils.floatRyaType) RyaTypeUtils.uriRyaType(org.apache.rya.api.domain.RyaTypeUtils.uriRyaType) RyaTypeUtils.longRyaType(org.apache.rya.api.domain.RyaTypeUtils.longRyaType) RyaTypeUtils.stringRyaType(org.apache.rya.api.domain.RyaTypeUtils.stringRyaType) RyaTypeUtils.doubleRyaType(org.apache.rya.api.domain.RyaTypeUtils.doubleRyaType) RyaTypeUtils.byteRyaType(org.apache.rya.api.domain.RyaTypeUtils.byteRyaType) RyaTypeUtils.booleanRyaType(org.apache.rya.api.domain.RyaTypeUtils.booleanRyaType) RyaTypeUtils.dateRyaType(org.apache.rya.api.domain.RyaTypeUtils.dateRyaType) Type(org.apache.rya.indexing.entity.model.Type) RyaTypeUtils.intRyaType(org.apache.rya.api.domain.RyaTypeUtils.intRyaType) DuplicateDataConfig(org.apache.rya.indexing.smarturi.duplication.conf.DuplicateDataConfig) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) Property(org.apache.rya.indexing.entity.model.Property) Test(org.junit.Test)

Example 3 with DuplicateDataConfig

use of org.apache.rya.indexing.smarturi.duplication.conf.DuplicateDataConfig in project incubator-rya by apache.

The method testReadConfigFile of the class DuplicateDataDetectorIT.

/**
 * Checks that a {@link DuplicateDataConfig} built with its no-argument
 * constructor returns a non-null value from every getter exercised below.
 * Going by the test name, the no-argument constructor presumably loads its
 * values from a default configuration file -- confirm against
 * {@link DuplicateDataConfig}.
 */
@Test
public void testReadConfigFile() throws SmartUriException, ConfigurationException {
    final DuplicateDataConfig config = new DuplicateDataConfig();

    // Per-datatype tolerances.
    assertNotNull(config.getBooleanTolerance());
    assertNotNull(config.getByteTolerance());
    assertNotNull(config.getDateTolerance());
    assertNotNull(config.getDoubleTolerance());
    assertNotNull(config.getFloatTolerance());
    assertNotNull(config.getIntegerTolerance());
    assertNotNull(config.getLongTolerance());
    assertNotNull(config.getShortTolerance());
    assertNotNull(config.getStringTolerance());
    assertNotNull(config.getUriTolerance());

    // Remaining settings.
    assertNotNull(config.getEquivalentTermsMap());
    // NOTE(review): if isDetectionEnabled() returns a primitive boolean, the
    // value is autoboxed and this assertion can never fail -- confirm.
    assertNotNull(config.isDetectionEnabled());
}
Also used : DuplicateDataConfig(org.apache.rya.indexing.smarturi.duplication.conf.DuplicateDataConfig) Test(org.junit.Test)

Aggregations

DuplicateDataConfig (org.apache.rya.indexing.smarturi.duplication.conf.DuplicateDataConfig)3 Test (org.junit.Test)3 ImmutableList (com.google.common.collect.ImmutableList)2 Date (java.util.Date)2 List (java.util.List)2 ReflectionToStringBuilder (org.apache.commons.lang.builder.ReflectionToStringBuilder)2 RyaType (org.apache.rya.api.domain.RyaType)2 RyaTypeUtils.booleanRyaType (org.apache.rya.api.domain.RyaTypeUtils.booleanRyaType)2 RyaTypeUtils.byteRyaType (org.apache.rya.api.domain.RyaTypeUtils.byteRyaType)2 RyaTypeUtils.dateRyaType (org.apache.rya.api.domain.RyaTypeUtils.dateRyaType)2 RyaTypeUtils.doubleRyaType (org.apache.rya.api.domain.RyaTypeUtils.doubleRyaType)2 RyaTypeUtils.floatRyaType (org.apache.rya.api.domain.RyaTypeUtils.floatRyaType)2 RyaTypeUtils.intRyaType (org.apache.rya.api.domain.RyaTypeUtils.intRyaType)2 RyaTypeUtils.longRyaType (org.apache.rya.api.domain.RyaTypeUtils.longRyaType)2 RyaTypeUtils.shortRyaType (org.apache.rya.api.domain.RyaTypeUtils.shortRyaType)2 RyaTypeUtils.stringRyaType (org.apache.rya.api.domain.RyaTypeUtils.stringRyaType)2 RyaTypeUtils.uriRyaType (org.apache.rya.api.domain.RyaTypeUtils.uriRyaType)2 Entity (org.apache.rya.indexing.entity.model.Entity)2 Builder (org.apache.rya.indexing.entity.model.Entity.Builder)2 Property (org.apache.rya.indexing.entity.model.Property)2