Use of de.tblsoft.solr.pipeline.bean.Document in the project solr-cmd-utils by tblsoft.
From the class WhitelistTopicTermsFilter, method init.
/**
 * Loads the whitelist CSV and indexes every row by its topic and value.
 *
 * <p>Reads the properties {@code fieldTopic}, {@code fieldValue}, {@code override}
 * (default {@code true}), {@code arrayDelimiter} (default {@code ";"}) and {@code filename}.
 * The file is resolved against {@code getBaseDir()} and parsed as UTF-8, comma-separated
 * {@code CSVFormat.RFC4180} with the header taken from the file. Every row becomes a
 * {@link Document} with one field per header column; each cell is split with
 * {@link String#split(String)}, so {@code arrayDelimiter} is interpreted as a regular
 * expression. The document is stored in {@code topicValues} under the row's topic and value,
 * and {@code topicsOverriden} receives a matching {@code false} entry.
 *
 * @throws RuntimeException wrapping any failure while resolving, opening, parsing or closing
 *         the file, including a data row being read while the configured topic or value
 *         column is absent from the header
 */
@Override
public void init() {
    fieldTopic = getProperty("fieldTopic", null);
    fieldValue = getProperty("fieldValue", null);
    override = getPropertyAsBoolean("override", true);
    arrayDelimiter = getProperty("arrayDelimiter", ";");
    topicValues = new HashMap<String, HashMap<String, Document>>();
    topicsOverriden = new HashMap<String, HashMap<String, Boolean>>();

    String filename = null;
    try {
        filename = getProperty("filename", null);
        String absoluteFilename = IOUtils.getAbsoluteFile(getBaseDir(), filename);
        CSVFormat format = CSVFormat.RFC4180.withHeader().withDelimiter(',');

        // Resources are closed in reverse order: parser, reader, then the underlying stream.
        try (InputStream in = IOUtils.getInputStream(absoluteFilename);
             java.io.Reader reader = new InputStreamReader(in, StandardCharsets.UTF_8);
             CSVParser parser = format.parse(reader)) {

            // The header does not change between records, so look it up once.
            Map<String, Integer> header = parser.getHeaderMap();
            Integer topicColumn = header.get(fieldTopic);
            Integer valueColumn = header.get(fieldValue);

            for (CSVRecord record : parser) {
                // Checked per record (not up front) so that a file without data rows
                // still loads even if the columns are missing.
                if (topicColumn == null || valueColumn == null) {
                    throw new IllegalArgumentException(
                            "Whitelist file " + absoluteFilename
                                    + " must contain the columns configured as fieldTopic ("
                                    + fieldTopic + ") and fieldValue (" + fieldValue
                                    + "); found columns: " + header.keySet());
                }

                Document document = new Document();
                for (String column : header.keySet()) {
                    try {
                        String[] values = record.get(column).split(arrayDelimiter);
                        document.addField(column, Arrays.asList(values));
                    } catch (IllegalArgumentException ignored) {
                        // Best effort: the column is left out of the document, e.g. when
                        // record.get(name) rejects a row that is inconsistent with the header.
                    }
                }

                String topic = record.get(topicColumn.intValue());
                String value = record.get(valueColumn.intValue());
                if (!topicValues.containsKey(topic)) {
                    topicValues.put(topic, new HashMap<String, Document>());
                    topicsOverriden.put(topic, new HashMap<String, Boolean>());
                }
                topicValues.get(topic).put(value, document);
                topicsOverriden.get(topic).put(value, false);
            }
        }
    } catch (Exception e) {
        throw new RuntimeException("Could not load whitelist file '" + filename + "'", e);
    }
    super.init();
}
Use of de.tblsoft.solr.pipeline.bean.Document in the project solr-cmd-utils by tblsoft.
From the class WhitelistTopicTermsFilter, method document.
/**
 * Forwards the document downstream, first replacing its fields with the whitelist entry
 * that matches its topic and value.
 *
 * <p>The replacement only happens when {@code override} is set, the document has both the
 * {@code fieldTopic} and {@code fieldValue} fields, and {@code topicValues} holds an entry
 * for that topic/value pair. In that case the entry is also flagged {@code true} in
 * {@code topicsOverriden}. In every case the document is passed to
 * {@code super.document} exactly once.
 *
 * @param document the document to process
 */
@Override
public void document(Document document) {
    final Field topicField = document.getField(fieldTopic);
    final Field valueField = document.getField(fieldValue);

    final boolean lookupPossible = override && topicField != null && valueField != null;
    if (lookupPossible
            && topicValues.containsKey(topicField.getValue())
            && topicValues.get(topicField.getValue()).containsKey(valueField.getValue())) {
        final Document whitelisted =
                topicValues.get(topicField.getValue()).get(valueField.getValue());
        for (Field replacement : whitelisted.getFields()) {
            document.setField(replacement.getName(), replacement.getValues());
        }
        super.document(document);
        // The keys are read again here rather than cached before the loop above.
        // NOTE(review): setField may have altered these fields — confirm against Document.
        topicsOverriden.get(topicField.getValue()).put(valueField.getValue(), true);
        return;
    }

    // No whitelist entry applied: pass the document through unchanged.
    super.document(document);
}
Use of de.tblsoft.solr.pipeline.bean.Document in the project solr-cmd-utils by tblsoft.
From the class NounExtractorFilter, method end.
/**
 * Emits one document per entry of {@code dictionary}, sorted in natural string order,
 * each carrying the entry in its {@code noun} field, and then ends the stage via
 * {@code super.end()}.
 */
@Override
public void end() {
    final List<String> sortedNouns = new ArrayList<String>(dictionary);
    Collections.sort(sortedNouns);

    for (int i = 0; i < sortedNouns.size(); i++) {
        final Document nounDocument = new Document();
        nounDocument.addField("noun", sortedNouns.get(i));
        super.document(nounDocument);
    }

    super.end();
}
Use of de.tblsoft.solr.pipeline.bean.Document in the project solr-cmd-utils by tblsoft.
From the class CompoundWordFilter, method end.
/**
 * Emits one document for every noun that contains other, sufficiently shorter nouns.
 *
 * <p>The whitelist is first merged into {@code nounList}. For each noun, every other entry
 * that is a substring of it and more than three characters shorter is collected as a
 * compound part. Nouns with at least one such part produce a document with the fields
 * {@code noun}, one {@code compound} value per part, and {@code tokenized}: the result of
 * {@code tokenize(noun, parts)} joined with single spaces. Afterwards the stage is ended
 * via {@code super.end()}.
 */
@Override
public void end() {
    // A part must be more than this many characters shorter than the noun containing it.
    final int minLengthDifference = 3;

    nounList.addAll(whitelist);
    for (String noun : nounList) {
        List<String> compoundList = new ArrayList<String>();
        for (String compound : nounList) {
            // The length gap is checked first because it is cheaper than the substring
            // search; it also rules out compound being equal to noun.
            if (noun.length() - compound.length() > minLengthDifference
                    && noun.contains(compound)) {
                compoundList.add(compound);
            }
        }
        if (compoundList.isEmpty()) {
            continue;
        }

        Document document = new Document();
        document.addField("noun", noun);
        for (String compound : compoundList) {
            document.addField("compound", compound);
        }
        List<String> tokens = tokenize(noun, compoundList);
        document.addField("tokenized", Joiner.on(" ").join(tokens));
        super.document(document);
    }
    super.end();
}
Use of de.tblsoft.solr.pipeline.bean.Document in the project solr-cmd-utils by tblsoft.
From the class SolrFieldCounter, method end.
/**
 * Emits a single summary document built from {@code solrFields.asMap()}: each map key
 * becomes a field whose value is the mapped {@code Long} rendered as a string. The
 * document is passed to {@code super.document} and the stage is then ended via
 * {@code super.end()}.
 */
@Override
public void end() {
    final Document summary = new Document();

    final Map<String, Long> valuesByField = solrFields.asMap();
    for (Map.Entry<String, Long> entry : valuesByField.entrySet()) {
        final String fieldName = entry.getKey();
        final Long fieldValue = entry.getValue();
        summary.addField(fieldName, String.valueOf(fieldValue));
    }

    super.document(summary);
    super.end();
}
Aggregations