
Example 81 with CharsetDecoder

Use of java.nio.charset.CharsetDecoder in project lucene-solr by apache.

From the class TestCharBlockArray, method testArray:

@Test
public void testArray() throws Exception {
    CharBlockArray array = new CharBlockArray();
    StringBuilder builder = new StringBuilder();
    final int n = 100 * 1000;
    byte[] buffer = new byte[50];
    for (int i = 0; i < n; i++) {
        random().nextBytes(buffer);
        int size = 1 + random().nextInt(50);
        // This test is turning random bytes into a string,
        // this is asking for trouble.
        CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder().onUnmappableCharacter(CodingErrorAction.REPLACE).onMalformedInput(CodingErrorAction.REPLACE);
        String s = decoder.decode(ByteBuffer.wrap(buffer, 0, size)).toString();
        array.append(s);
        builder.append(s);
    }
    for (int i = 0; i < n; i++) {
        random().nextBytes(buffer);
        int size = 1 + random().nextInt(50);
        // This test is turning random bytes into a string,
        // this is asking for trouble.
        CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder().onUnmappableCharacter(CodingErrorAction.REPLACE).onMalformedInput(CodingErrorAction.REPLACE);
        String s = decoder.decode(ByteBuffer.wrap(buffer, 0, size)).toString();
        array.append((CharSequence) s);
        builder.append(s);
    }
    for (int i = 0; i < n; i++) {
        random().nextBytes(buffer);
        int size = 1 + random().nextInt(50);
        // This test is turning random bytes into a string,
        // this is asking for trouble.
        CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder().onUnmappableCharacter(CodingErrorAction.REPLACE).onMalformedInput(CodingErrorAction.REPLACE);
        String s = decoder.decode(ByteBuffer.wrap(buffer, 0, size)).toString();
        for (int j = 0; j < s.length(); j++) {
            array.append(s.charAt(j));
        }
        builder.append(s);
    }
    assertEqualsInternal("GrowingCharArray<->StringBuilder mismatch.", builder, array);
    Path tempDir = createTempDir("growingchararray");
    Path f = tempDir.resolve("GrowingCharArrayTest.tmp");
    BufferedOutputStream out = new BufferedOutputStream(Files.newOutputStream(f));
    array.flush(out);
    out.flush();
    out.close();
    BufferedInputStream in = new BufferedInputStream(Files.newInputStream(f));
    array = CharBlockArray.open(in);
    assertEqualsInternal("GrowingCharArray<->StringBuilder mismatch after flush/load.", builder, array);
    in.close();
}
Also used : Path(java.nio.file.Path) CharsetDecoder(java.nio.charset.CharsetDecoder) BufferedInputStream(java.io.BufferedInputStream) BufferedOutputStream(java.io.BufferedOutputStream) Test(org.junit.Test)
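Because both error actions are set to REPLACE, the decode call above can never throw, no matter which random bytes come in; invalid sequences simply become the replacement character. A minimal standalone sketch of that behaviour (the class name and the sample bytes are made up for illustration):

import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;

public class LenientDecodeSketch {
    public static void main(String[] args) throws CharacterCodingException {
        // 0xC3 starts a two-byte UTF-8 sequence, but the following byte is not a continuation byte
        byte[] bytes = { 'a', (byte) 0xC3, 'b' };
        CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
            .onMalformedInput(CodingErrorAction.REPLACE)
            .onUnmappableCharacter(CodingErrorAction.REPLACE);
        // with REPLACE, the bad byte becomes U+FFFD instead of raising CharacterCodingException
        String s = decoder.decode(ByteBuffer.wrap(bytes)).toString();
        System.out.println(s);   // prints a\uFFFDb
    }
}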

Example 82 with CharsetDecoder

Use of java.nio.charset.CharsetDecoder in project lucene-solr by apache.

From the class TokenInfoDictionaryBuilder, method buildDictionary:

public TokenInfoDictionaryWriter buildDictionary(List<File> csvFiles) throws IOException {
    TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);
    // all lines in the file
    System.out.println("  parse...");
    List<String[]> lines = new ArrayList<>(400000);
    for (File file : csvFiles) {
        FileInputStream inputStream = new FileInputStream(file);
        Charset cs = Charset.forName(encoding);
        CharsetDecoder decoder = cs.newDecoder().onMalformedInput(CodingErrorAction.REPORT).onUnmappableCharacter(CodingErrorAction.REPORT);
        InputStreamReader streamReader = new InputStreamReader(inputStream, decoder);
        BufferedReader reader = new BufferedReader(streamReader);
        String line = null;
        while ((line = reader.readLine()) != null) {
            String[] entry = CSVUtil.parse(line);
            if (entry.length < 13) {
                System.out.println("Entry in CSV is not valid: " + line);
                continue;
            }
            String[] formatted = formatEntry(entry);
            lines.add(formatted);
            // NFKC normalize dictionary entry
            if (normalizeEntries) {
                if (normalizer.isNormalized(entry[0])) {
                    continue;
                }
                String[] normalizedEntry = new String[entry.length];
                for (int i = 0; i < entry.length; i++) {
                    normalizedEntry[i] = normalizer.normalize(entry[i]);
                }
                formatted = formatEntry(normalizedEntry);
                lines.add(formatted);
            }
        }
    }
    System.out.println("  sort...");
    // sort by term: we sorted the files already and use a stable sort.
    Collections.sort(lines, new Comparator<String[]>() {

        public int compare(String[] left, String[] right) {
            return left[0].compareTo(right[0]);
        }
    });
    System.out.println("  encode...");
    PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
    Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15);
    IntsRefBuilder scratch = new IntsRefBuilder();
    // first ord will be 0
    long ord = -1;
    String lastValue = null;
    // build tokeninfo dictionary
    for (String[] entry : lines) {
        int next = dictionary.put(entry);
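        // note: 'offset' is a field of the enclosing builder, not declared in this snippet;
        // if put() returns an unchanged offset, nothing was written for this entry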
        if (next == offset) {
            System.out.println("Failed to process line: " + Arrays.toString(entry));
            continue;
        }
        String token = entry[0];
        if (!token.equals(lastValue)) {
            // new word to add to fst
            ord++;
            lastValue = token;
            scratch.grow(token.length());
            scratch.setLength(token.length());
            for (int i = 0; i < token.length(); i++) {
                scratch.setIntAt(i, (int) token.charAt(i));
            }
            fstBuilder.add(scratch.get(), ord);
        }
        dictionary.addMapping((int) ord, offset);
        offset = next;
    }
    final FST<Long> fst = fstBuilder.finish();
    System.out.print("  " + fstBuilder.getNodeCount() + " nodes, " + fstBuilder.getArcCount() + " arcs, " + fst.ramBytesUsed() + " bytes...  ");
    dictionary.setFST(fst);
    System.out.println(" done");
    return dictionary;
}
Also used : CharsetDecoder(java.nio.charset.CharsetDecoder) InputStreamReader(java.io.InputStreamReader) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) Builder(org.apache.lucene.util.fst.Builder) ArrayList(java.util.ArrayList) Charset(java.nio.charset.Charset) FileInputStream(java.io.FileInputStream) PositiveIntOutputs(org.apache.lucene.util.fst.PositiveIntOutputs) BufferedReader(java.io.BufferedReader) File(java.io.File)
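Here the decoder is configured with CodingErrorAction.REPORT, so a CSV file that is not really in the declared encoding makes readLine() fail fast instead of silently inserting replacement characters. A self-contained sketch of that effect (the byte array stands in for a mis-encoded file and is purely illustrative):

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.MalformedInputException;
import java.nio.charset.StandardCharsets;

public class StrictReaderSketch {
    public static void main(String[] args) throws IOException {
        // ISO-8859-1 encodes é as the single byte 0xE9, which is not valid UTF-8 on its own
        byte[] latin1Bytes = "caf\u00E9".getBytes(StandardCharsets.ISO_8859_1);
        CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
            .onMalformedInput(CodingErrorAction.REPORT)
            .onUnmappableCharacter(CodingErrorAction.REPORT);
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new ByteArrayInputStream(latin1Bytes), decoder))) {
            System.out.println(reader.readLine());
        } catch (MalformedInputException e) {
            // REPORT turns the bad byte into an exception rather than a silent U+FFFD
            System.out.println("not valid UTF-8: " + e);
        }
    }
}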

Example 83 with CharsetDecoder

Use of java.nio.charset.CharsetDecoder in project lucene-solr by apache.

From the class UnknownDictionaryBuilder, method readDictionaryFile:

public UnknownDictionaryWriter readDictionaryFile(String filename, String encoding) throws IOException {
    UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024);
    FileInputStream inputStream = new FileInputStream(filename);
    Charset cs = Charset.forName(encoding);
    CharsetDecoder decoder = cs.newDecoder().onMalformedInput(CodingErrorAction.REPORT).onUnmappableCharacter(CodingErrorAction.REPORT);
    InputStreamReader streamReader = new InputStreamReader(inputStream, decoder);
    LineNumberReader lineReader = new LineNumberReader(streamReader);
    dictionary.put(CSVUtil.parse(NGRAM_DICTIONARY_ENTRY));
    List<String[]> lines = new ArrayList<>();
    String line = null;
    while ((line = lineReader.readLine()) != null) {
        // note: unk.def only has 10 fields, it simplifies the writer to just append empty reading and pronunciation,
        // even though the unknown dictionary returns hardcoded null here.
        // Probably we don't need to validate entry
        final String[] parsed = CSVUtil.parse(line + ",*,*");
        lines.add(parsed);
    }
    Collections.sort(lines, new Comparator<String[]>() {

        public int compare(String[] left, String[] right) {
            int leftId = CharacterDefinition.lookupCharacterClass(left[0]);
            int rightId = CharacterDefinition.lookupCharacterClass(right[0]);
            return leftId - rightId;
        }
    });
    for (String[] entry : lines) {
        dictionary.put(entry);
    }
    return dictionary;
}
Also used : CharsetDecoder(java.nio.charset.CharsetDecoder) InputStreamReader(java.io.InputStreamReader) ArrayList(java.util.ArrayList) Charset(java.nio.charset.Charset) FileInputStream(java.io.FileInputStream) LineNumberReader(java.io.LineNumberReader)

Example 84 with CharsetDecoder

Use of java.nio.charset.CharsetDecoder in project lucene-solr by apache.

From the class LineFileDocs, method open:

private synchronized void open(Random random) throws IOException {
    InputStream is = getClass().getResourceAsStream(path);
    boolean needSkip = true;
    long size = 0L, seekTo = 0L;
    if (is == null) {
        // if it's not in classpath, we load it as absolute filesystem path (e.g. Hudson's home dir)
        Path file = Paths.get(path);
        size = Files.size(file);
        if (path.endsWith(".gz")) {
            // if it is a gzip file, we need to use InputStream and slowly skipTo:
            is = Files.newInputStream(file);
        } else {
            // optimized seek using SeekableByteChannel
            seekTo = randomSeekPos(random, size);
            final SeekableByteChannel channel = Files.newByteChannel(file);
            if (LuceneTestCase.VERBOSE) {
                System.out.println("TEST: LineFileDocs: file seek to fp=" + seekTo + " on open");
            }
            channel.position(seekTo);
            is = Channels.newInputStream(channel);
            needSkip = false;
        }
    } else {
        // if the file comes from Classpath:
        size = is.available();
    }
    if (path.endsWith(".gz")) {
        is = new GZIPInputStream(is);
        // guestimate:
        size *= 2.8;
    }
    // if we could not position a channel above, we have to skip through the stream instead,
    // but this seek is a scan, so very inefficient!!!
    if (needSkip) {
        seekTo = randomSeekPos(random, size);
        if (LuceneTestCase.VERBOSE) {
            System.out.println("TEST: LineFileDocs: stream skip to fp=" + seekTo + " on open");
        }
        is.skip(seekTo);
    }
    // if we seeked somewhere, read until newline char
    if (seekTo > 0L) {
        int b;
        do {
            b = is.read();
        } while (b >= 0 && b != 13 && b != 10);
    }
    CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder().onMalformedInput(CodingErrorAction.REPORT).onUnmappableCharacter(CodingErrorAction.REPORT);
    reader = new BufferedReader(new InputStreamReader(is, decoder), BUFFER_SIZE);
    if (seekTo > 0L) {
        // read one more line, to make sure we are not inside a Windows linebreak (\r\n):
        reader.readLine();
    }
}
Also used : Path(java.nio.file.Path) SeekableByteChannel(java.nio.channels.SeekableByteChannel) GZIPInputStream(java.util.zip.GZIPInputStream) CharsetDecoder(java.nio.charset.CharsetDecoder) InputStreamReader(java.io.InputStreamReader) InputStream(java.io.InputStream) BufferedReader(java.io.BufferedReader) IntPoint(org.apache.lucene.document.IntPoint)
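The interesting part of this method is the resynchronization: a random byte offset almost always lands in the middle of a line (and possibly in the middle of a multi-byte UTF-8 sequence), so the code scans forward to a CR or LF before handing the stream to the decoder, then drops one more line to avoid starting inside a Windows \r\n pair. A rough standalone sketch of the same idea against a throwaway temp file (the file contents and the offset are made up for illustration):

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.channels.Channels;
import java.nio.channels.SeekableByteChannel;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;

public class SeekToLineSketch {
    public static void main(String[] args) throws Exception {
        Path file = Files.createTempFile("lines", ".txt");
        Files.write(file, "first line\nsecond line\nthird line\n".getBytes(StandardCharsets.UTF_8));
        try (SeekableByteChannel channel = Files.newByteChannel(file)) {
            // jump to an arbitrary byte offset; this usually lands in the middle of a line
            channel.position(3);
            InputStream is = Channels.newInputStream(channel);
            // skip forward until the next CR or LF so the reader starts on a line boundary
            int b;
            do {
                b = is.read();
            } while (b >= 0 && b != 13 && b != 10);
            CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
                .onMalformedInput(CodingErrorAction.REPORT)
                .onUnmappableCharacter(CodingErrorAction.REPORT);
            BufferedReader reader = new BufferedReader(new InputStreamReader(is, decoder));
            System.out.println(reader.readLine());  // prints "second line"
        } finally {
            Files.deleteIfExists(file);
        }
    }
}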

Example 85 with CharsetDecoder

Use of java.nio.charset.CharsetDecoder in project lucene-solr by apache.

From the class AbstractAnalysisFactory, method getSnowballWordSet:

/** same as {@link #getWordSet(ResourceLoader, String, boolean)},
   * except the input is in snowball format. */
protected final CharArraySet getSnowballWordSet(ResourceLoader loader, String wordFiles, boolean ignoreCase) throws IOException {
    List<String> files = splitFileNames(wordFiles);
    CharArraySet words = null;
    if (files.size() > 0) {
        // default stopwords list has 35 or so words, but maybe don't make it that
        // big to start
        words = new CharArraySet(files.size() * 10, ignoreCase);
        for (String file : files) {
            InputStream stream = null;
            Reader reader = null;
            try {
                stream = loader.openResource(file.trim());
                CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder().onMalformedInput(CodingErrorAction.REPORT).onUnmappableCharacter(CodingErrorAction.REPORT);
                reader = new InputStreamReader(stream, decoder);
                WordlistLoader.getSnowballWordSet(reader, words);
            } finally {
                IOUtils.closeWhileHandlingException(reader, stream);
            }
        }
    }
    return words;
}
Also used : CharArraySet(org.apache.lucene.analysis.CharArraySet) CharsetDecoder(java.nio.charset.CharsetDecoder) InputStreamReader(java.io.InputStreamReader) InputStream(java.io.InputStream) Reader(java.io.Reader)
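The try/finally with IOUtils.closeWhileHandlingException releases both the reader and the underlying stream even if decoding fails. Outside of Lucene the same guarantee can come from try-with-resources; a rough sketch under the assumption of a plain one-word-per-line file (it deliberately does not reproduce the snowball format that WordlistLoader parses, and the path is hypothetical):

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.HashSet;
import java.util.Set;

public class WordFileSketch {
    // reads one word per line, skipping blank lines; the file name is hypothetical
    static Set<String> readWords(String path) throws IOException {
        CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
            .onMalformedInput(CodingErrorAction.REPORT)
            .onUnmappableCharacter(CodingErrorAction.REPORT);
        Set<String> words = new HashSet<>();
        // both the reader and the underlying stream are closed, in reverse order,
        // even when an exception is thrown while reading
        try (InputStream stream = Files.newInputStream(Paths.get(path));
             BufferedReader reader = new BufferedReader(new InputStreamReader(stream, decoder))) {
            String line;
            while ((line = reader.readLine()) != null) {
                line = line.trim();
                if (!line.isEmpty()) {
                    words.add(line);
                }
            }
        }
        return words;
    }
}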

Aggregations

CharsetDecoder (java.nio.charset.CharsetDecoder) 90
CharBuffer (java.nio.CharBuffer) 45
ByteBuffer (java.nio.ByteBuffer) 33
CoderResult (java.nio.charset.CoderResult) 25
Charset (java.nio.charset.Charset) 24
InputStreamReader (java.io.InputStreamReader) 11
CharacterCodingException (java.nio.charset.CharacterCodingException) 9
IOException (java.io.IOException) 8
BufferedReader (java.io.BufferedReader) 5
Properties (java.util.Properties) 5
RegisterRequestProcessor (com.linkedin.databus.container.request.RegisterRequestProcessor) 4
LogicalSource (com.linkedin.databus.core.data_model.LogicalSource) 4
ChunkedWritableByteChannel (com.linkedin.databus2.core.container.ChunkedWritableByteChannel) 4
DatabusRequest (com.linkedin.databus2.core.container.request.DatabusRequest) 4
SchemaRegistryService (com.linkedin.databus2.schemas.SchemaRegistryService) 4
SourceIdNameRegistry (com.linkedin.databus2.schemas.SourceIdNameRegistry) 4
InputStream (java.io.InputStream) 4
Reader (java.io.Reader) 4
ArrayList (java.util.ArrayList) 4
RegisterResponseEntry (com.linkedin.databus2.core.container.request.RegisterResponseEntry) 3