Use of org.tartarus.snowball.ext.PorterStemmer in project step by STEPBible — class SubjectSuggestionServiceImpl, method collectNonExactMatches.
/**
 * Collects subject suggestions that are not exact matches for the user's input.
 * Candidates come from three sources: headings in the reference book index, the
 * simple ("root") Nave index and the full Nave index. Results are merged with the
 * already-retrieved exact matches so duplicates are collapsed.
 *
 * @param collector        receives the total count of suggestions found
 * @param context          carries the user's typed input
 * @param alreadyRetrieved exact matches found earlier, folded in first so the
 *                         sources below do not re-add them
 * @param leftToCollect    maximum number of terms requested from each source
 * @return the merged suggestions, alphabetically ordered by their stemmed key
 */
@Override
public SubjectSuggestion[] collectNonExactMatches(final TermsAndMaxCount<SubjectSuggestion> collector, final SuggestionContext context, final SubjectSuggestion[] alreadyRetrieved, final int leftToCollect) {
    // Keyed by stemmed form so spelling variants collapse to one suggestion;
    // TreeMap keeps the final ordering alphabetical.
    final Map<String, SubjectSuggestion> suggestions = new TreeMap<String, SubjectSuggestion>();
    final PorterStemmer stemmer = new PorterStemmer();
    addExistingMappings(suggestions, stemmer, alreadyRetrieved);

    final String input = context.getInput();
    final TermsAndMaxCount termsFromHeadings = LuceneUtils.getAllTermsPrefixedWith(false, false, this.jSwordSearchService.getIndexSearcher(JSwordPassageService.REFERENCE_BOOK), LuceneIndex.FIELD_HEADING, input, leftToCollect);
    final TermsAndMaxCount termsFromSimpleNave = this.naves.findSetOfTermsWithCounts(false, true, input, leftToCollect, "root");
    final TermsAndMaxCount termsFromFullNave = this.naves.findSetOfTermsWithCounts(false, true, input, leftToCollect, "fullTerm");

    addSubjectTerms(suggestions, stemmer, termsFromHeadings.getTerms(), SearchType.SUBJECT_SIMPLE);
    addSubjectTerms(suggestions, stemmer, termsFromSimpleNave.getTerms(), SearchType.SUBJECT_EXTENDED);
    addSubjectTerms(suggestions, stemmer, termsFromFullNave.getTerms(), SearchType.SUBJECT_FULL);

    collector.setTotalCount(suggestions.size());
    // Zero-length array is the idiomatic toArray argument; the previous code built a
    // throwaway TermsAndMaxCount plus a HashSet copy solely to obtain the same size.
    return suggestions.values().toArray(new SubjectSuggestion[0]);
}
Use of org.tartarus.snowball.ext.PorterStemmer in project Dawrly by ZeyadTarekk — class ProcessString, method stemming.
/**
 * Replaces each word in the given list with its Porter-stemmed form.
 * The list is modified in place and returned for call-chaining convenience.
 *
 * @param words the tokens to stem; each element is overwritten with its stem
 * @return the same list instance, now containing stemmed tokens
 */
protected static List<String> stemming(List<String> words) {
    PorterStemmer stemmer = new PorterStemmer();
    for (int idx = 0, count = words.size(); idx < count; idx++) {
        // Feed the token through the stemmer and write the stem back in place.
        stemmer.setCurrent(words.get(idx));
        stemmer.stem();
        words.set(idx, stemmer.getCurrent());
    }
    return words;
}
Use of org.tartarus.snowball.ext.PorterStemmer in project Dawrly by ZeyadTarekk — class Ranker, method stemTheWord.
// HashMap<String, HashMap<String, Pair<Integer, Integer,Double>>>
// Words page tf size score
/**
 * Returns the Porter stem of a single word.
 *
 * @param word the token to stem
 * @return the stemmed form produced by the Snowball Porter stemmer
 */
private String stemTheWord(String word) {
    final PorterStemmer porter = new PorterStemmer();
    porter.setCurrent(word);
    porter.stem();
    return porter.getCurrent();
}
Use of org.tartarus.snowball.ext.PorterStemmer in project Dawrly by ZeyadTarekk — class Indexer, method filterTags.
/**
 * Extracts the words appearing inside the "important" HTML tags of the parsed
 * document, stems each word, and accumulates a per-word score weighted by the
 * tag it appeared in. The resulting score map is stored under {@code fileName}
 * in the shared {@code scoreOfWords} map.
 *
 * @param html     the parsed document; a null document yields an empty score map
 * @param fileName key under which the computed scores are recorded
 * @throws IOException declared for interface compatibility with callers
 */
private static synchronized void filterTags(org.jsoup.nodes.Document html, String fileName) throws IOException {
    HashMap<String, Double> tempScore = new HashMap<>();
    // Guard BEFORE dereferencing: the original checked html != null only after
    // calling html.select(...), which would already have thrown an NPE.
    if (html != null) {
        PorterStemmer stemmer = new PorterStemmer();
        Pattern pattern = Pattern.compile("\\w+");
        // Walk each scored tag selector and score every word found inside it.
        for (String tagSelector : tagsOfHtml.keySet()) {
            String taggedText = html.select(tagSelector).text();
            if (taggedText.isEmpty()) {
                continue;
            }
            Matcher matcher = pattern.matcher(taggedText.toLowerCase());
            while (matcher.find()) {
                stemmer.setCurrent(matcher.group());
                stemmer.stem();
                // Separate variable for the stem — the original reused taggedText,
                // clobbering the tag's text mid-loop.
                String stemmedWord = stemmer.getCurrent();
                // Accumulate this tag's weight onto the word's running score.
                tempScore.merge(stemmedWord, tagsOfHtml.get(tagSelector), Double::sum);
            }
        }
    }
    scoreOfWords.put(fileName, tempScore);
}
Use of org.tartarus.snowball.ext.PorterStemmer in project Dawrly by ZeyadTarekk — class Indexer, method getIndexOfWord.
// get indices of each word in each Document
/**
 * Records, for each word, the character offsets of its whole-word occurrences in
 * the original document, keyed by the word's Porter stem. Words never found as a
 * whole word get the sentinel index -2. The per-word index map is stored under
 * {@code fileName} in the shared {@code indicesOfWord} map.
 *
 * @param splitWord   the tokens to locate
 * @param originalDoc the full document text to search
 * @param fileName    key under which the index map is recorded
 */
private static synchronized void getIndexOfWord(List<String> splitWord, StringBuilder originalDoc, String fileName) {
    // TODO: matching actual string not substring in document (carried over from original)
    final int lengthOfDoc = originalDoc.length();
    PorterStemmer stemmer = new PorterStemmer();
    HashMap<String, List<Integer>> tempIndex = new HashMap<>();
    HashSet<Integer> occurrences = new HashSet<>();
    for (String word : splitWord) {
        int startFrom = 0;
        while (true) {
            int index = originalDoc.indexOf(word, startFrom);
            // Check for "not found" FIRST — the original inspected boundary
            // characters before this test, reading a meaningless charAt when
            // index was -1.
            if (index < 0) {
                break;
            }
            // Whole-word match: neither neighbour may be an ASCII letter.
            // Document edges count as non-letters.
            boolean letterBefore = index > 0 && isAsciiLetter(originalDoc.charAt(index - 1));
            boolean letterAfter = index + word.length() < lengthOfDoc && isAsciiLetter(originalDoc.charAt(index + word.length()));
            if (!letterBefore && !letterAfter) {
                occurrences.add(index);
            }
            startFrom = index + word.length();
        }
        stemmer.setCurrent(word.toLowerCase());
        stemmer.stem();
        if (occurrences.isEmpty()) {
            // Sentinel: the word never appeared as a whole word in the body.
            occurrences.add(-2);
        }
        tempIndex.put(stemmer.getCurrent(), new ArrayList<>(occurrences));
        occurrences.clear();
    }
    // Drop any entry whose stem collapsed to the empty string.
    tempIndex.keySet().remove("");
    indicesOfWord.put(fileName, tempIndex);
}

/** True iff {@code c} is an ASCII letter — replaces the per-character regex
 *  {@code matches(".*[a-zA-Z]+.*")} the original ran inside the search loop. */
private static boolean isAsciiLetter(char c) {
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}
Aggregations