Search in sources :

Example 21 with Document

use of de.tblsoft.solr.pipeline.bean.Document in project solr-cmd-utils by tblsoft.

the class WhitelistTopicTermsFilter method init.

/**
 * Reads the filter configuration and loads the whitelist CSV into memory.
 *
 * <p>Properties read: {@code fieldTopic}, {@code fieldValue}, {@code override} (default
 * {@code true}), {@code arrayDelimiter} (default {@code ";"}) and {@code filename}, which is
 * resolved against {@code getBaseDir()}.
 *
 * <p>The file is decoded as UTF-8 and parsed with {@code CSVFormat.RFC4180.withHeader()} and a
 * {@code ','} delimiter. Every record becomes a {@link Document} holding one field per header
 * column, with the cell split on {@code arrayDelimiter}. The document is stored in
 * {@code topicValues} under the record's topic cell and value cell, and the matching entry in
 * {@code topicsOverriden} is initialised to {@code false}.
 *
 * @throws RuntimeException wrapping any failure while opening or parsing the file, including
 *         a file with records whose header lacks the configured topic or value column
 */
@Override
public void init() {
    fieldTopic = getProperty("fieldTopic", null);
    fieldValue = getProperty("fieldValue", null);
    override = getPropertyAsBoolean("override", true);
    arrayDelimiter = getProperty("arrayDelimiter", ";");
    topicValues = new HashMap<String, HashMap<String, Document>>();
    topicsOverriden = new HashMap<String, HashMap<String, Boolean>>();
    InputStream in = null;
    try {
        String filename = getProperty("filename", null);
        String absoluteFilename = IOUtils.getAbsoluteFile(getBaseDir(), filename);
        in = IOUtils.getInputStream(absoluteFilename);
        java.io.Reader reader = new InputStreamReader(in, StandardCharsets.UTF_8);
        CSVParser parser = CSVFormat.RFC4180.withHeader().withDelimiter(',').parse(reader);

        // The header does not change between records, so fetch it (and the two key
        // columns) once instead of once per record.
        Map<String, Integer> header = parser.getHeaderMap();
        Integer topicIndex = header.get(fieldTopic);
        Integer valueIndex = header.get(fieldValue);

        for (CSVRecord record : parser) {
            // Checked only once a record exists, so a file without records still yields
            // an empty whitelist instead of an error.
            if (topicIndex == null || valueIndex == null) {
                throw new IllegalArgumentException("CSV file " + absoluteFilename
                        + " has no column for fieldTopic '" + fieldTopic
                        + "' and/or fieldValue '" + fieldValue
                        + "'; available columns: " + header.keySet());
            }

            Document document = new Document();
            for (String key : header.keySet()) {
                try {
                    // String.split treats arrayDelimiter as a regular expression.
                    String[] values = record.get(key).split(arrayDelimiter);
                    document.addField(key, Arrays.asList(values));
                } catch (IllegalArgumentException ignored) {
                    // Best effort: presumably a record with fewer cells than the header;
                    // the column is left out of the document.
                }
            }

            String topic = record.get(topicIndex.intValue());
            String value = record.get(valueIndex.intValue());
            if (!topicValues.containsKey(topic)) {
                topicValues.put(topic, new HashMap<String, Document>());
                topicsOverriden.put(topic, new HashMap<String, Boolean>());
            }
            topicValues.get(topic).put(value, document);
            topicsOverriden.get(topic).put(value, false);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        if (in != null) {
            try {
                in.close();
            } catch (IOException ignored) {
                // A failure to close the already-consumed stream is deliberately not fatal.
            }
        }
    }
    super.init();
}
Also used : Document(de.tblsoft.solr.pipeline.bean.Document) InputStreamReader(java.io.InputStreamReader) InputStream(java.io.InputStream) IOException(java.io.IOException) IOException(java.io.IOException) CSVParser(org.apache.commons.csv.CSVParser) CSVFormat(org.apache.commons.csv.CSVFormat) CSVRecord(org.apache.commons.csv.CSVRecord)

Example 22 with Document

use of de.tblsoft.solr.pipeline.bean.Document in project solr-cmd-utils by tblsoft.

the class WhitelistTopicTermsFilter method document.

/**
 * Forwards the document downstream, first replacing its fields with the whitelisted ones
 * when overriding is enabled and the document's topic/value pair is present in
 * {@code topicValues}.
 *
 * <p>On a match, every field of the stored whitelist document is copied onto the incoming
 * document, the document is passed to {@code super.document}, and the pair is then flagged
 * {@code true} in {@code topicsOverriden}. Otherwise the document is passed on unchanged.
 *
 * @param document the document to process
 */
@Override
public void document(Document document) {
    final Field topicField = document.getField(fieldTopic);
    final Field valueField = document.getField(fieldValue);

    final boolean whitelisted = override
            && topicField != null
            && valueField != null
            && topicValues.containsKey(topicField.getValue())
            && topicValues.get(topicField.getValue()).containsKey(valueField.getValue());

    if (!whitelisted) {
        super.document(document);
        return;
    }

    final Document replacement = topicValues.get(topicField.getValue()).get(valueField.getValue());
    for (Field replacementField : replacement.getFields()) {
        document.setField(replacementField.getName(), replacementField.getValues());
    }
    super.document(document);
    // Flag the pair only after the document has been handed downstream.
    topicsOverriden.get(topicField.getValue()).put(valueField.getValue(), true);
}
Also used : Field(de.tblsoft.solr.pipeline.bean.Field) Document(de.tblsoft.solr.pipeline.bean.Document)

Example 23 with Document

use of de.tblsoft.solr.pipeline.bean.Document in project solr-cmd-utils by tblsoft.

the class NounExtractorFilter method end.

/**
 * Emits the collected {@code dictionary} entries in natural sort order, one document per
 * entry with the entry stored in the {@code noun} field, then delegates to
 * {@code super.end()}.
 */
@Override
public void end() {
    // Sort a copy so that the dictionary itself is left untouched.
    final List<String> sortedNouns = new ArrayList<String>(dictionary);
    Collections.sort(sortedNouns);

    for (int i = 0; i < sortedNouns.size(); i++) {
        final Document nounDocument = new Document();
        nounDocument.addField("noun", sortedNouns.get(i));
        super.document(nounDocument);
    }

    super.end();
}
Also used : Document(de.tblsoft.solr.pipeline.bean.Document)

Example 24 with Document

use of de.tblsoft.solr.pipeline.bean.Document in project solr-cmd-utils by tblsoft.

the class CompoundWordFilter method end.

/**
 * Emits one document for every noun that contains at least one other known noun, then
 * delegates to {@code super.end()}.
 *
 * <p>The {@code whitelist} entries are first added to {@code nounList}. A noun {@code c}
 * counts as a compound part of noun {@code n} when {@code n} contains {@code c}, the two
 * differ, and {@code n} is more than three characters longer than {@code c}. Each emitted
 * document carries the noun in {@code noun}, every part in {@code compound}, and the result
 * of {@code tokenize(noun, parts)} joined with single spaces in {@code tokenized}. Nouns
 * without any such part produce no document.
 */
@Override
public void end() {
    nounList.addAll(whitelist);
    for (String noun : nounList) {
        List<String> compoundList = new ArrayList<String>();
        for (String compound : nounList) {
            // A distinct substring is always strictly shorter than the string containing
            // it, so the length difference is positive and needs no Math.abs.
            if (noun.contains(compound)
                    && !noun.equals(compound)
                    && noun.length() - compound.length() > 3) {
                compoundList.add(compound);
            }
        }
        if (compoundList.isEmpty()) {
            continue;
        }

        Document document = new Document();
        document.addField("noun", noun);
        for (String compound : compoundList) {
            document.addField("compound", compound);
        }
        List<String> tokens = tokenize(noun, compoundList);
        document.addField("tokenized", Joiner.on(" ").join(tokens));
        super.document(document);
    }
    super.end();
}
Also used : ArrayList(java.util.ArrayList) Document(de.tblsoft.solr.pipeline.bean.Document)

Example 25 with Document

use of de.tblsoft.solr.pipeline.bean.Document in project solr-cmd-utils by tblsoft.

the class SolrFieldCounter method end.

/**
 * Emits a single summary document with one field per entry of {@code solrFields.asMap()},
 * named after the entry's key and holding the entry's value rendered as a string, then
 * delegates to {@code super.end()}.
 */
@Override
public void end() {
    final Document counts = new Document();
    for (Map.Entry<String, Long> counter : solrFields.asMap().entrySet()) {
        final String fieldName = counter.getKey();
        final String occurrences = String.valueOf(counter.getValue());
        counts.addField(fieldName, occurrences);
    }
    super.document(counts);
    super.end();
}
Also used : Document(de.tblsoft.solr.pipeline.bean.Document) Map(java.util.Map) AtomicLongMap(com.google.common.util.concurrent.AtomicLongMap)

Aggregations

Document (de.tblsoft.solr.pipeline.bean.Document) 51, ArrayList (java.util.ArrayList) 9, Map (java.util.Map) 9, Test (org.junit.Test) 9, Field (de.tblsoft.solr.pipeline.bean.Field) 8, DocumentDiff (de.tblsoft.solr.pipeline.bean.DocumentDiff) 4, AbstractFilterTest (de.tblsoft.solr.pipeline.test.AbstractFilterTest) 4, GsonBuilder (com.google.gson.GsonBuilder) 3, Match (oi.thekraken.grok.api.Match) 3, AtomicLongMap (com.google.common.util.concurrent.AtomicLongMap) 2, Gson (com.google.gson.Gson) 2, JsonElement (com.google.gson.JsonElement) 2, DocumentContext (com.jayway.jsonpath.DocumentContext) 2, PathNotFoundException (com.jayway.jsonpath.PathNotFoundException) 2, DocumentBuilder (de.tblsoft.solr.pipeline.bean.DocumentBuilder) 2, SimpleMapping (de.tblsoft.solr.pipeline.filter.SimpleMapping) 2, File (java.io.File) 2, IOException (java.io.IOException) 2, InputStream (java.io.InputStream) 2, InputStreamReader (java.io.InputStreamReader) 2