// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /** ******************************************************************************* * Copyright (C) 2006-2014, International Business Machines Corporation * * and others. All Rights Reserved. * ******************************************************************************* */ #ifndef DICTBE_H #define DICTBE_H #include "unicode/utypes.h" #include "unicode/uniset.h" #include "unicode/utext.h" #include "brkeng.h" #include "hash.h" #include "mlbe.h" #include "uvectr32.h" U_NAMESPACE_BEGIN class DictionaryMatcher; class MlBreakEngine; class Normalizer2; /******************************************************************* * DictionaryBreakEngine */ /** *

 * DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
 * dictionary to determine language-specific breaks.
 *
 * After it is constructed a DictionaryBreakEngine may be shared between
 * threads without synchronization.

*/
class DictionaryBreakEngine : public LanguageBreakEngine {
 private:
    /**
     * The set of characters handled by this engine
     * @internal
     */
    UnicodeSet fSet;

 public:
    /**
     * Constructor.
     */
    DictionaryBreakEngine();

    /**
     * Virtual destructor.
     */
    virtual ~DictionaryBreakEngine();

    /**
     * Indicate whether this engine handles a particular character for
     * a particular kind of break.
     *
     * @param c A character which begins a run that the engine might handle
     * @param locale The locale.
     * @return true if this engine handles the particular character and break
     * type.
     */
    virtual UBool handles(UChar32 c, const char* locale) const override;

    /**
     * Find any breaks within a run in the supplied text.
     *
     * @param text A UText representing the text. The iterator is left at
     * the end of the run of characters which the engine is capable of handling
     * that starts from the first character in the range.
     * @param startPos The start of the run within the supplied text.
     * @param endPos The end of the run within the supplied text.
     * @param foundBreaks vector of int32_t to receive the break positions
     * @param isPhraseBreaking Not described by the original documentation;
     * presumably selects phrase-based breaking -- TODO confirm against the
     * implementation.
     * @param status Information on any errors encountered.
     * @return The number of breaks found.
     */
    virtual int32_t findBreaks( UText *text,
                                int32_t startPos,
                                int32_t endPos,
                                UVector32 &foundBreaks,
                                UBool isPhraseBreaking,
                                UErrorCode& status ) const override;

 protected:
    /**
     * Set the character set handled by this engine.
     *
     * @param set A UnicodeSet of the set of characters handled by the engine
     */
    virtual void setCharacters( const UnicodeSet &set );

    /**
     * Divide up a range of known dictionary characters handled by this break engine.
     *
     * Pure virtual: every concrete engine declared in this header overrides it.
     *
     * @param text A UText representing the text
     * @param rangeStart The start of the range of dictionary characters
     * @param rangeEnd The end of the range of dictionary characters
     * @param foundBreaks Output: UVector32 of int32_t break positions. It is
     * passed by reference, so unlike a C array pointer it cannot be 0.
     * @param isPhraseBreaking Not described by the original documentation;
     * presumably selects phrase-based breaking -- TODO confirm against the
     * implementation.
     * @param status Information on any errors encountered.
     * @return The number of breaks found
     */
    virtual int32_t divideUpDictionaryRange( UText *text,
                                             int32_t rangeStart,
                                             int32_t rangeEnd,
                                             UVector32 &foundBreaks,
                                             UBool isPhraseBreaking,
                                             UErrorCode& status) const = 0;
};

/*******************************************************************
 * ThaiBreakEngine
 */

/**

 * ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
 * dictionary and heuristics to determine Thai-specific breaks.
 *
 * After it is constructed a ThaiBreakEngine may be shared between
 * threads without synchronization.

*/
class ThaiBreakEngine : public DictionaryBreakEngine {
 private:
    /**
     * Character sets consulted by this engine. Their individual roles are
     * defined by the implementation, which is not part of this header.
     * @internal
     */
    UnicodeSet fEndWordSet;
    UnicodeSet fBeginWordSet;
    UnicodeSet fSuffixSet;
    UnicodeSet fMarkSet;
    // Presumably the matcher adopted by the constructor -- confirm in the
    // implementation.
    // NOTE(review): a destructor is declared but no copy constructor or copy
    // assignment is declared in this class; confirm that a base class
    // prevents copying, otherwise a copy would share this pointer.
    DictionaryMatcher *fDictionary;

 public:
    /**
     * Constructor. (Not a default constructor: it takes a dictionary and a
     * status argument.)
     *
     * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
     * engine is deleted.
     * @param status Information on any errors encountered.
     */
    ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);

    /**
     * Virtual destructor.
     */
    virtual ~ThaiBreakEngine();

 protected:
    /**
     * Divide up a range of known dictionary characters handled by this break engine.
     *
     * @param text A UText representing the text
     * @param rangeStart The start of the range of dictionary characters
     * @param rangeEnd The end of the range of dictionary characters
     * @param foundBreaks Output: UVector32 of int32_t break positions. It is
     * passed by reference, so unlike a C array pointer it cannot be 0.
     * @param isPhraseBreaking Not described by the original documentation;
     * presumably selects phrase-based breaking -- TODO confirm against the
     * implementation.
     * @param status Information on any errors encountered.
     * @return The number of breaks found
     */
    virtual int32_t divideUpDictionaryRange( UText *text,
                                             int32_t rangeStart,
                                             int32_t rangeEnd,
                                             UVector32 &foundBreaks,
                                             UBool isPhraseBreaking,
                                             UErrorCode& status) const override;
};

/*******************************************************************
 * LaoBreakEngine
 */

/**

 * LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
 * dictionary and heuristics to determine Lao-specific breaks.
 *
 * After it is constructed a LaoBreakEngine may be shared between
 * threads without synchronization.

*/
class LaoBreakEngine : public DictionaryBreakEngine {
 private:
    /**
     * Character sets consulted by this engine. Their individual roles are
     * defined by the implementation, which is not part of this header.
     * @internal
     */
    UnicodeSet fEndWordSet;
    UnicodeSet fBeginWordSet;
    UnicodeSet fMarkSet;
    // Presumably the matcher adopted by the constructor -- confirm in the
    // implementation.
    // NOTE(review): a destructor is declared but no copy constructor or copy
    // assignment is declared in this class; confirm that a base class
    // prevents copying, otherwise a copy would share this pointer.
    DictionaryMatcher *fDictionary;

 public:
    /**
     * Constructor. (Not a default constructor: it takes a dictionary and a
     * status argument.)
     *
     * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
     * engine is deleted.
     * @param status Information on any errors encountered.
     */
    LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);

    /**
     * Virtual destructor.
     */
    virtual ~LaoBreakEngine();

 protected:
    /**
     * Divide up a range of known dictionary characters handled by this break engine.
     *
     * @param text A UText representing the text
     * @param rangeStart The start of the range of dictionary characters
     * @param rangeEnd The end of the range of dictionary characters
     * @param foundBreaks Output: UVector32 of int32_t break positions. It is
     * passed by reference, so unlike a C array pointer it cannot be 0.
     * @param isPhraseBreaking Not described by the original documentation;
     * presumably selects phrase-based breaking -- TODO confirm against the
     * implementation.
     * @param status Information on any errors encountered.
     * @return The number of breaks found
     */
    virtual int32_t divideUpDictionaryRange( UText *text,
                                             int32_t rangeStart,
                                             int32_t rangeEnd,
                                             UVector32 &foundBreaks,
                                             UBool isPhraseBreaking,
                                             UErrorCode& status) const override;
};

/*******************************************************************
 * BurmeseBreakEngine
 */

/**

 * BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
 * DictionaryMatcher and heuristics to determine Burmese-specific breaks.
 *
 * After it is constructed a BurmeseBreakEngine may be shared between
 * threads without synchronization.

*/
class BurmeseBreakEngine : public DictionaryBreakEngine {
 private:
    /**
     * Character sets consulted by this engine. Their individual roles are
     * defined by the implementation, which is not part of this header.
     * @internal
     */
    UnicodeSet fEndWordSet;
    UnicodeSet fBeginWordSet;
    UnicodeSet fMarkSet;
    // Presumably the matcher adopted by the constructor -- confirm in the
    // implementation.
    // NOTE(review): a destructor is declared but no copy constructor or copy
    // assignment is declared in this class; confirm that a base class
    // prevents copying, otherwise a copy would share this pointer.
    DictionaryMatcher *fDictionary;

 public:
    /**
     * Constructor. (Not a default constructor: it takes a dictionary and a
     * status argument.)
     *
     * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
     * engine is deleted.
     * @param status Information on any errors encountered.
     */
    BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);

    /**
     * Virtual destructor.
     */
    virtual ~BurmeseBreakEngine();

 protected:
    /**
     * Divide up a range of known dictionary characters.
     *
     * @param text A UText representing the text
     * @param rangeStart The start of the range of dictionary characters
     * @param rangeEnd The end of the range of dictionary characters
     * @param foundBreaks Output: UVector32 of int32_t break positions. It is
     * passed by reference, so unlike a C array pointer it cannot be 0.
     * @param isPhraseBreaking Not described by the original documentation;
     * presumably selects phrase-based breaking -- TODO confirm against the
     * implementation.
     * @param status Information on any errors encountered.
     * @return The number of breaks found
     */
    virtual int32_t divideUpDictionaryRange( UText *text,
                                             int32_t rangeStart,
                                             int32_t rangeEnd,
                                             UVector32 &foundBreaks,
                                             UBool isPhraseBreaking,
                                             UErrorCode& status) const override;
};

/*******************************************************************
 * KhmerBreakEngine
 */

/**

 * KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
 * DictionaryMatcher and heuristics to determine Khmer-specific breaks.
 *
 * After it is constructed a KhmerBreakEngine may be shared between
 * threads without synchronization.

*/
class KhmerBreakEngine : public DictionaryBreakEngine {
 private:
    /**
     * Character sets consulted by this engine. Their individual roles are
     * defined by the implementation, which is not part of this header.
     * @internal
     */
    UnicodeSet fEndWordSet;
    UnicodeSet fBeginWordSet;
    UnicodeSet fMarkSet;
    // Presumably the matcher adopted by the constructor -- confirm in the
    // implementation.
    // NOTE(review): a destructor is declared but no copy constructor or copy
    // assignment is declared in this class; confirm that a base class
    // prevents copying, otherwise a copy would share this pointer.
    DictionaryMatcher *fDictionary;

 public:
    /**
     * Constructor. (Not a default constructor: it takes a dictionary and a
     * status argument.)
     *
     * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
     * engine is deleted.
     * @param status Information on any errors encountered.
     */
    KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);

    /**
     * Virtual destructor.
     */
    virtual ~KhmerBreakEngine();

 protected:
    /**
     * Divide up a range of known dictionary characters.
     *
     * @param text A UText representing the text
     * @param rangeStart The start of the range of dictionary characters
     * @param rangeEnd The end of the range of dictionary characters
     * @param foundBreaks Output: UVector32 of int32_t break positions. It is
     * passed by reference, so unlike a C array pointer it cannot be 0.
     * @param isPhraseBreaking Not described by the original documentation;
     * presumably selects phrase-based breaking -- TODO confirm against the
     * implementation.
     * @param status Information on any errors encountered.
     * @return The number of breaks found
     */
    virtual int32_t divideUpDictionaryRange( UText *text,
                                             int32_t rangeStart,
                                             int32_t rangeEnd,
                                             UVector32 &foundBreaks,
                                             UBool isPhraseBreaking,
                                             UErrorCode& status) const override;
};

#if !UCONFIG_NO_NORMALIZATION

/*******************************************************************
 * CjkBreakEngine
 */

// Indicates the language/script that the CjkBreakEngine will handle.
enum LanguageType {
    kKorean,
    kChineseJapanese
};

/**

 * CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
 * dictionary with costs associated with each word and
 * Viterbi decoding to determine CJK-specific breaks.

*/
class CjkBreakEngine : public DictionaryBreakEngine {
 protected:
    /**
     * Character sets, dictionary and helpers consulted by this engine. Their
     * individual roles are defined by the implementation, which is not part
     * of this header.
     * @internal
     */
    UnicodeSet fHangulWordSet;
    UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
    UnicodeSet fClosePunctuationSet;
    // Presumably the matcher adopted by the constructor -- confirm in the
    // implementation.
    DictionaryMatcher *fDictionary;
    // NOTE(review): ownership of the next two pointers is not visible from
    // this header -- confirm in the implementation.
    const Normalizer2 *nfkcNorm2;
    MlBreakEngine *fMlBreakEngine;
    // NOTE(review): presumably true when the engine was created for
    // kChineseJapanese rather than kKorean -- confirm in the constructor.
    bool isCj;

 private:
    // Load Japanese extensions.
    void loadJapaneseExtensions(UErrorCode& error);

    // Load Japanese Hiragana.
    void loadHiragana(UErrorCode& error);

    // Initialize fSkipSet by loading Japanese Hiragana and extensions.
    void initJapanesePhraseParameter(UErrorCode& error);

    // Populated by initJapanesePhraseParameter() (see the comment on that
    // method above).
    Hashtable fSkipSet;

 public:
    /**
     * Constructor. (Not a default constructor: it takes a dictionary, a
     * language type and a status argument.)
     *
     * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
     * engine is deleted. The DictionaryMatcher must contain costs for each word
     * in order for the dictionary to work properly.
     * @param type The language/script this engine will handle
     * (kKorean or kChineseJapanese).
     * @param status Information on any errors encountered.
     */
    CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);

    /**
     * Virtual destructor.
     */
    virtual ~CjkBreakEngine();

 protected:
    /**
     * Divide up a range of known dictionary characters handled by this break engine.
     *
     * @param text A UText representing the text
     * @param rangeStart The start of the range of dictionary characters
     * @param rangeEnd The end of the range of dictionary characters
     * @param foundBreaks Output: UVector32 of int32_t break positions. It is
     * passed by reference, so unlike a C array pointer it cannot be 0.
     * @param isPhraseBreaking Not described by the original documentation;
     * presumably selects phrase-based breaking -- TODO confirm against the
     * implementation.
     * @param status Information on any errors encountered.
     * @return The number of breaks found
     */
    virtual int32_t divideUpDictionaryRange( UText *text,
                                             int32_t rangeStart,
                                             int32_t rangeEnd,
                                             UVector32 &foundBreaks,
                                             UBool isPhraseBreaking,
                                             UErrorCode& status) const override;
};

#endif

U_NAMESPACE_END

    /* DICTBE_H */
#endif