Examples with BreakIterator - com.ibm.icu.text.BreakIterator

Example 1 with BreakIterator

use of com.ibm.icu.text.BreakIterator in project eclipse.platform.text by eclipse.

the class AbstractDecoratedTextEditor method openSaveErrorDialog.

/**
 * Presents an error dialog to the user when a problem happens during save.
 * <p>
 * Overrides the default behavior by showing a more advanced error dialog in case of encoding
 * problems. For a charset mapping failure on a storage-based document provider the dialog
 * offers two extra buttons: one that switches the encoding to UTF-8 and saves again, and
 * (when the current charset is known) one that selects the first character that cannot be
 * encoded with that charset.
 * </p>
 *
 * @param title the dialog title
 * @param message the message to display
 * @param exception the exception to handle
 * @since 3.6
 */
@Override
protected void openSaveErrorDialog(String title, String message, CoreException exception) {
    IStatus status = exception.getStatus();
    final IDocumentProvider documentProvider = getDocumentProvider();
    // Anything other than a charset mapping failure is handled by the default dialog.
    if (!(status.getCode() == IFileBufferStatusCodes.CHARSET_MAPPING_FAILED && documentProvider instanceof IStorageDocumentProvider)) {
        super.openSaveErrorDialog(title, message, exception);
        return;
    }
    // Custom button ids, chosen above the ids used by the standard OK/Cancel buttons.
    final int saveAsUTF8ButtonId = IDialogConstants.OK_ID + IDialogConstants.CANCEL_ID + 1;
    final int selectUnmappableCharButtonId = saveAsUTF8ButtonId + 1;
    final Charset charset = getCharset();
    ErrorDialog errorDialog = new ErrorDialog(getSite().getShell(), title, message, status, IStatus.ERROR) {

        @Override
        protected void createButtonsForButtonBar(Composite parent) {
            super.createButtonsForButtonBar(parent);
            createButton(parent, saveAsUTF8ButtonId, TextEditorMessages.AbstractDecoratedTextEditor_save_error_Dialog_button_saveAsUTF8, false);
            // Without a charset there is no encoder to test characters against.
            if (charset != null)
                createButton(parent, selectUnmappableCharButtonId, TextEditorMessages.AbstractDecoratedTextEditor_save_error_Dialog_button_selectUnmappable, false);
        }

        @Override
        protected void buttonPressed(int id) {
            if (id == saveAsUTF8ButtonId || id == selectUnmappableCharButtonId) {
                // Report the pressed custom button to the caller through the return code.
                setReturnCode(id);
                close();
            } else
                super.buttonPressed(id);
        }

        @Override
        protected boolean shouldShowDetailsButton() {
            return false;
        }
    };
    int returnCode = errorDialog.open();
    if (returnCode == saveAsUTF8ButtonId) {
        ((IStorageDocumentProvider) documentProvider).setEncoding(getEditorInput(), "UTF-8"); //$NON-NLS-1$
        IProgressMonitor monitor = getProgressMonitor();
        try {
            doSave(monitor);
        } finally {
            monitor.done();
        }
    } else if (returnCode == selectUnmappableCharButtonId) {
        CharsetEncoder encoder = charset.newEncoder();
        IDocument document = getDocumentProvider().getDocument(getEditorInput());
        int documentLength = document.getLength();
        int offset = 0;
        // Walk the text one character boundary at a time so that multi-unit characters
        // are tested (and selected) as a whole.
        BreakIterator charBreakIterator = BreakIterator.getCharacterInstance();
        charBreakIterator.setText(document.get());
        while (offset < documentLength) {
            int next = charBreakIterator.next();
            if (next == BreakIterator.DONE) {
                // No further boundary: stop instead of looping on an offset that can no longer advance.
                break;
            }
            try {
                String ch = document.get(offset, next - offset);
                if (!encoder.canEncode(ch)) {
                    selectAndReveal(offset, next - offset);
                    return;
                }
            } catch (BadLocationException ex) {
                EditorsPlugin.log(ex);
                // Skip this character. Showing yet another dialog here is overkill
            }
            // Always move on, also after a failure, so the failed range is really skipped.
            offset = next;
        }
    }
}

Also used : IStorageDocumentProvider(org.eclipse.ui.editors.text.IStorageDocumentProvider) IStatus(org.eclipse.core.runtime.IStatus) Composite(org.eclipse.swt.widgets.Composite) Charset(java.nio.charset.Charset) ErrorDialog(org.eclipse.jface.dialogs.ErrorDialog) CharsetEncoder(java.nio.charset.CharsetEncoder) Point(org.eclipse.swt.graphics.Point) BreakIterator(com.ibm.icu.text.BreakIterator) IProgressMonitor(org.eclipse.core.runtime.IProgressMonitor) IDocument(org.eclipse.jface.text.IDocument) BadLocationException(org.eclipse.jface.text.BadLocationException)

Example 2 with BreakIterator

use of com.ibm.icu.text.BreakIterator in project es6draft by anba.

the class SegmentIteratorPrototype method AdvanceSegmentIterator.

/**
 * AdvanceSegmentIterator ( iterator, direction )
 * <p>
 * Moves the segment iterator to the next boundary in the requested direction and records the
 * break type found at that boundary together with the new position.
 *
 * @param iterator
 *            the segment iterator object
 * @param direction
 *            the direction kind
 * @return {@code true} if iterator has hit the end of the string, otherwise {@code false}
 */
public static boolean AdvanceSegmentIterator(SegmentIteratorObject iterator, Direction direction) {
    /* steps 1-3 */
    final BreakIterator segmenter = iterator.getBreakIterator();
    final String text = iterator.getString();
    final int start = iterator.getPosition();
    /* step 4 */
    if (direction == Direction.Forwards && start >= text.length()) {
        return true;
    }
    if (direction == Direction.Backwards && start <= 0) {
        return true;
    }
    /* step 5 */
    final int boundary;
    if (direction != Direction.Forwards) {
        assert direction == Direction.Backwards;
        boundary = segmenter.preceding(start);
    } else {
        boundary = segmenter.following(start);
    }
    /* step 6 */
    String kind = null;
    if (boundary != BreakIterator.DONE) {
        kind = breakTypeAtCurrentBoundary(segmenter, iterator.getGranularity());
    }
    iterator.setBreakType(kind);
    /* step 7 */
    iterator.setPosition(segmenter.current());
    /* step 8 */
    return false;
}

/**
 * Maps the rule status of the break iterator's current boundary to a break type name for the
 * given granularity; returns {@code null} when no break type applies.
 */
private static String breakTypeAtCurrentBoundary(BreakIterator segmenter, String granularity) {
    switch (granularity) {
        case "grapheme":
            // Always undefined.
            return null;
        case "word": {
            final int status = segmenter.getRuleStatus();
            if (ruleStatusWithin(status, BreakIterator.WORD_NONE, BreakIterator.WORD_NONE_LIMIT)) {
                return "none";
            }
            final boolean wordLike = ruleStatusWithin(status, BreakIterator.WORD_NUMBER, BreakIterator.WORD_NUMBER_LIMIT)
                    || ruleStatusWithin(status, BreakIterator.WORD_LETTER, BreakIterator.WORD_LETTER_LIMIT)
                    || ruleStatusWithin(status, BreakIterator.WORD_KANA, BreakIterator.WORD_KANA_LIMIT)
                    || ruleStatusWithin(status, BreakIterator.WORD_IDEO, BreakIterator.WORD_IDEO_LIMIT);
            return wordLike ? "word" : null;
        }
        case "line": {
            final int status = segmenter.getRuleStatus();
            if (ruleStatusWithin(status, LineBreakTag.SOFT, LineBreakTag.SOFT_LIMIT)) {
                return "soft";
            }
            if (ruleStatusWithin(status, LineBreakTag.HARD, LineBreakTag.HARD_LIMIT)) {
                return "hard";
            }
            return null;
        }
        case "sentence": {
            final int status = segmenter.getRuleStatus();
            if (ruleStatusWithin(status, SentenceBreakTag.TERM, SentenceBreakTag.TERM_LIMIT)) {
                return "term";
            }
            if (ruleStatusWithin(status, SentenceBreakTag.SEP, SentenceBreakTag.SEP_LIMIT)) {
                return "sep";
            }
            return null;
        }
        default:
            throw new AssertionError();
    }
}

/** Tests whether {@code status} lies in the half-open range {@code [from, limit)}. */
private static boolean ruleStatusWithin(int status, int from, int limit) {
    return from <= status && status < limit;
}

Also used : BreakIterator(com.ibm.icu.text.BreakIterator)

Example 3 with BreakIterator

use of com.ibm.icu.text.BreakIterator in project elasticsearch by elastic.

the class IcuTokenizerFactory method parseRules.

// Parse a single RBBI rule file: read it from the config directory, drop every line that
// starts with '#', and build a rule-based break iterator from the remaining lines.
private BreakIterator parseRules(String filename, Environment env) throws IOException {
    final Path ruleFile = env.configFile().resolve(filename);
    final StringBuilder ruleText = new StringBuilder();
    boolean first = true;
    for (String line : Files.readAllLines(ruleFile)) {
        if (line.startsWith("#")) {
            continue;
        }
        // Kept lines are separated by (not terminated with) a newline.
        if (!first) {
            ruleText.append("\n");
        }
        ruleText.append(line);
        first = false;
    }
    return new RuleBasedBreakIterator(ruleText.toString());
}

Also used : Path(java.nio.file.Path) ElasticsearchException(org.elasticsearch.ElasticsearchException) UCharacter(com.ibm.icu.lang.UCharacter) Tokenizer(org.apache.lucene.analysis.Tokenizer) UScript(com.ibm.icu.lang.UScript) Files(java.nio.file.Files) RuleBasedBreakIterator(com.ibm.icu.text.RuleBasedBreakIterator) Environment(org.elasticsearch.env.Environment) BreakIterator(com.ibm.icu.text.BreakIterator) IOException(java.io.IOException) HashMap(java.util.HashMap) Collectors(java.util.stream.Collectors) ICUTokenizer(org.apache.lucene.analysis.icu.segmentation.ICUTokenizer) DefaultICUTokenizerConfig(org.apache.lucene.analysis.icu.segmentation.DefaultICUTokenizerConfig) Settings(org.elasticsearch.common.settings.Settings) Map(java.util.Map) IndexSettings(org.elasticsearch.index.IndexSettings) UProperty(com.ibm.icu.lang.UProperty) Path(java.nio.file.Path) ICUTokenizerConfig(org.apache.lucene.analysis.icu.segmentation.ICUTokenizerConfig) RuleBasedBreakIterator(com.ibm.icu.text.RuleBasedBreakIterator)

Example 4 with BreakIterator

use of com.ibm.icu.text.BreakIterator in project elasticsearch by elastic.

the class IcuTokenizerFactory method getIcuConfig.

/**
 * Builds the tokenizer configuration from the {@code RULE_FILES} setting.
 * <p>
 * Every entry of the setting must have the form {@code code:rulefile}. The script code is
 * resolved to its ICU script value and the rule file is parsed into a break iterator that
 * replaces the default one for that script.
 *
 * @param env the environment handed to {@code parseRules} for locating rule files
 * @param settings the settings holding the {@code RULE_FILES} entries
 * @return the tailored configuration, or {@code null} when no rule files are configured
 * @throws ElasticsearchException wrapping any failure while reading the setting or the rules
 */
private ICUTokenizerConfig getIcuConfig(Environment env, Settings settings) {
    final Map<Integer, String> ruleFileByScript = new HashMap<>();
    try {
        for (String pair : settings.getAsArray(RULE_FILES)) {
            final int separator = pair.indexOf(":");
            final boolean separatorMissing = separator == -1;
            final boolean fileMissing = separator == pair.length() - 1;
            if (separatorMissing || fileMissing) {
                throw new IllegalArgumentException(RULE_FILES + " should contain comma-separated \"code:rulefile\" pairs");
            }
            final String script = pair.substring(0, separator).trim();
            final String ruleFile = pair.substring(separator + 1).trim();
            ruleFileByScript.put(UCharacter.getPropertyValueEnum(UProperty.SCRIPT, script), ruleFile);
        }
        if (ruleFileByScript.isEmpty()) {
            return null;
        }
        // One slot per script code; slots without a tailored rule file stay null.
        final BreakIterator[] tailoredBreakers = new BreakIterator[UScript.CODE_LIMIT];
        for (Map.Entry<Integer, String> tailoring : ruleFileByScript.entrySet()) {
            final int scriptCode = tailoring.getKey();
            tailoredBreakers[scriptCode] = parseRules(tailoring.getValue(), env);
        }
        // Neither cjkAsWords nor myanmarAsWords is configurable yet.
        return new DefaultICUTokenizerConfig(true, true) {

            @Override
            public BreakIterator getBreakIterator(int script) {
                final BreakIterator tailoredBreaker = tailoredBreakers[script];
                if (tailoredBreaker == null) {
                    return super.getBreakIterator(script);
                }
                // Return a copy so the parsed iterator itself is never handed out.
                return (BreakIterator) tailoredBreaker.clone();
            }
        };
    } catch (Exception e) {
        throw new ElasticsearchException("failed to load ICU rule files", e);
    }
}

Also used : HashMap(java.util.HashMap) ElasticsearchException(org.elasticsearch.ElasticsearchException) ElasticsearchException(org.elasticsearch.ElasticsearchException) IOException(java.io.IOException) RuleBasedBreakIterator(com.ibm.icu.text.RuleBasedBreakIterator) BreakIterator(com.ibm.icu.text.BreakIterator) DefaultICUTokenizerConfig(org.apache.lucene.analysis.icu.segmentation.DefaultICUTokenizerConfig) HashMap(java.util.HashMap) Map(java.util.Map) DefaultICUTokenizerConfig(org.apache.lucene.analysis.icu.segmentation.DefaultICUTokenizerConfig) ICUTokenizerConfig(org.apache.lucene.analysis.icu.segmentation.ICUTokenizerConfig)

Example 5 with BreakIterator

use of com.ibm.icu.text.BreakIterator in project es6draft by anba.

the class SegmenterObject method createBreakIterator.

/**
 * Creates the ICU break iterator for this object's locale and granularity.
 *
 * @return a character, word, sentence or line break iterator, depending on the granularity
 */
private BreakIterator createBreakIterator() {
    ULocale resolved = ULocale.forLanguageTag(this.locale);
    if ("line".equals(granularity)) {
        // "strictness" cannot be set through unicode extensions (u-lb-strict), handle here:
        resolved = resolved.setKeywordValue("lb", strictness);
    }
    switch (granularity) {
        case "grapheme":
            return BreakIterator.getCharacterInstance(resolved);
        case "word":
            return BreakIterator.getWordInstance(resolved);
        case "sentence":
            return BreakIterator.getSentenceInstance(resolved);
        case "line":
            return BreakIterator.getLineInstance(resolved);
        default:
            throw new AssertionError();
    }
}

Also used : ULocale(com.ibm.icu.util.ULocale) BreakIterator(com.ibm.icu.text.BreakIterator)

Aggregations

BreakIterator (com.ibm.icu.text.BreakIterator)5 RuleBasedBreakIterator (com.ibm.icu.text.RuleBasedBreakIterator)2 IOException (java.io.IOException)2 HashMap (java.util.HashMap)2 Map (java.util.Map)2 DefaultICUTokenizerConfig (org.apache.lucene.analysis.icu.segmentation.DefaultICUTokenizerConfig)2 ICUTokenizerConfig (org.apache.lucene.analysis.icu.segmentation.ICUTokenizerConfig)2 ElasticsearchException (org.elasticsearch.ElasticsearchException)2 UCharacter (com.ibm.icu.lang.UCharacter)1 UProperty (com.ibm.icu.lang.UProperty)1 UScript (com.ibm.icu.lang.UScript)1 ULocale (com.ibm.icu.util.ULocale)1 Charset (java.nio.charset.Charset)1 CharsetEncoder (java.nio.charset.CharsetEncoder)1 Files (java.nio.file.Files)1 Path (java.nio.file.Path)1 Collectors (java.util.stream.Collectors)1 Tokenizer (org.apache.lucene.analysis.Tokenizer)1 ICUTokenizer (org.apache.lucene.analysis.icu.segmentation.ICUTokenizer)1 IProgressMonitor (org.eclipse.core.runtime.IProgressMonitor)1