2020-08-11 11:10:23 +02:00
|
|
|
// © 2016 and later: Unicode, Inc. and others.
|
|
|
|
// License & terms of use: http://www.unicode.org/copyright.html
|
|
|
|
/**
|
|
|
|
*******************************************************************************
|
|
|
|
* Copyright (C) 2006-2014, International Business Machines Corporation *
|
|
|
|
* and others. All Rights Reserved. *
|
|
|
|
*******************************************************************************
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef DICTBE_H
|
|
|
|
#define DICTBE_H
|
|
|
|
|
|
|
|
#include "unicode/utypes.h"
|
|
|
|
#include "unicode/uniset.h"
|
|
|
|
#include "unicode/utext.h"
|
|
|
|
|
|
|
|
#include "brkeng.h"
|
2022-05-17 17:14:19 +02:00
|
|
|
#include "hash.h"
|
2023-05-23 02:05:01 +02:00
|
|
|
#include "mlbe.h"
|
2020-08-11 11:10:23 +02:00
|
|
|
#include "uvectr32.h"
|
|
|
|
|
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
|
|
|
|
class DictionaryMatcher;
|
2023-05-23 02:05:01 +02:00
|
|
|
class MlBreakEngine;
|
2020-08-11 11:10:23 +02:00
|
|
|
class Normalizer2;
|
|
|
|
|
|
|
|
/*******************************************************************
|
|
|
|
* DictionaryBreakEngine
|
|
|
|
*/
|
|
|
|
|
|
|
|
/**
|
|
|
|
* <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
|
|
|
|
* dictionary to determine language-specific breaks.</p>
|
|
|
|
*
|
|
|
|
* <p>After it is constructed a DictionaryBreakEngine may be shared between
|
|
|
|
* threads without synchronization.</p>
|
|
|
|
*/
|
|
|
|
class DictionaryBreakEngine : public LanguageBreakEngine {
|
|
|
|
private:
|
|
|
|
/**
|
|
|
|
* The set of characters handled by this engine
|
|
|
|
* @internal
|
|
|
|
*/
|
|
|
|
|
|
|
|
UnicodeSet fSet;
|
|
|
|
|
|
|
|
public:
|
|
|
|
|
|
|
|
/**
|
|
|
|
* <p>Constructor </p>
|
|
|
|
*/
|
|
|
|
DictionaryBreakEngine();
|
|
|
|
|
|
|
|
/**
|
|
|
|
* <p>Virtual destructor.</p>
|
|
|
|
*/
|
|
|
|
virtual ~DictionaryBreakEngine();
|
|
|
|
|
|
|
|
/**
|
|
|
|
* <p>Indicate whether this engine handles a particular character for
|
|
|
|
* a particular kind of break.</p>
|
|
|
|
*
|
|
|
|
* @param c A character which begins a run that the engine might handle
|
2023-11-01 07:56:12 +01:00
|
|
|
* @param locale The locale.
|
2020-08-11 11:10:23 +02:00
|
|
|
* @return true if this engine handles the particular character and break
|
|
|
|
* type.
|
|
|
|
*/
|
2023-11-01 07:56:12 +01:00
|
|
|
virtual UBool handles(UChar32 c, const char* locale) const override;
|
2020-08-11 11:10:23 +02:00
|
|
|
|
|
|
|
/**
|
|
|
|
* <p>Find any breaks within a run in the supplied text.</p>
|
|
|
|
*
|
|
|
|
* @param text A UText representing the text. The iterator is left at
|
2021-10-28 08:15:28 +02:00
|
|
|
* the end of the run of characters which the engine is capable of handling
|
2020-08-11 11:10:23 +02:00
|
|
|
* that starts from the first character in the range.
|
|
|
|
* @param startPos The start of the run within the supplied text.
|
|
|
|
* @param endPos The end of the run within the supplied text.
|
|
|
|
* @param foundBreaks vector of int32_t to receive the break positions
|
2021-10-28 08:15:28 +02:00
|
|
|
* @param status Information on any errors encountered.
|
2020-08-11 11:10:23 +02:00
|
|
|
* @return The number of breaks found.
|
|
|
|
*/
|
|
|
|
virtual int32_t findBreaks( UText *text,
|
|
|
|
int32_t startPos,
|
|
|
|
int32_t endPos,
|
2021-10-28 08:15:28 +02:00
|
|
|
UVector32 &foundBreaks,
|
2022-05-17 17:14:19 +02:00
|
|
|
UBool isPhraseBreaking,
|
2021-10-28 08:15:28 +02:00
|
|
|
UErrorCode& status ) const override;
|
2020-08-11 11:10:23 +02:00
|
|
|
|
|
|
|
protected:
|
|
|
|
|
|
|
|
/**
|
|
|
|
* <p>Set the character set handled by this engine.</p>
|
|
|
|
*
|
|
|
|
* @param set A UnicodeSet of the set of characters handled by the engine
|
|
|
|
*/
|
|
|
|
virtual void setCharacters( const UnicodeSet &set );
|
|
|
|
|
|
|
|
/**
|
|
|
|
* <p>Divide up a range of known dictionary characters handled by this break engine.</p>
|
|
|
|
*
|
|
|
|
* @param text A UText representing the text
|
|
|
|
* @param rangeStart The start of the range of dictionary characters
|
|
|
|
* @param rangeEnd The end of the range of dictionary characters
|
|
|
|
* @param foundBreaks Output of C array of int32_t break positions, or 0
|
2021-10-28 08:15:28 +02:00
|
|
|
* @param status Information on any errors encountered.
|
2020-08-11 11:10:23 +02:00
|
|
|
* @return The number of breaks found
|
|
|
|
*/
|
|
|
|
virtual int32_t divideUpDictionaryRange( UText *text,
|
|
|
|
int32_t rangeStart,
|
|
|
|
int32_t rangeEnd,
|
2021-10-28 08:15:28 +02:00
|
|
|
UVector32 &foundBreaks,
|
2022-05-17 17:14:19 +02:00
|
|
|
UBool isPhraseBreaking,
|
2021-10-28 08:15:28 +02:00
|
|
|
UErrorCode& status) const = 0;
|
2020-08-11 11:10:23 +02:00
|
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
/*******************************************************************
|
|
|
|
* ThaiBreakEngine
|
|
|
|
*/
|
|
|
|
|
|
|
|
/**
|
|
|
|
* <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
|
|
|
|
* dictionary and heuristics to determine Thai-specific breaks.</p>
|
|
|
|
*
|
|
|
|
* <p>After it is constructed a ThaiBreakEngine may be shared between
|
|
|
|
* threads without synchronization.</p>
|
|
|
|
*/
|
|
|
|
class ThaiBreakEngine : public DictionaryBreakEngine {
|
|
|
|
private:
|
|
|
|
/**
|
|
|
|
* The set of characters handled by this engine
|
|
|
|
* @internal
|
|
|
|
*/
|
|
|
|
|
|
|
|
UnicodeSet fEndWordSet;
|
|
|
|
UnicodeSet fBeginWordSet;
|
|
|
|
UnicodeSet fSuffixSet;
|
|
|
|
UnicodeSet fMarkSet;
|
|
|
|
DictionaryMatcher *fDictionary;
|
|
|
|
|
|
|
|
public:
|
|
|
|
|
|
|
|
/**
|
|
|
|
* <p>Default constructor.</p>
|
|
|
|
*
|
|
|
|
* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
|
|
|
|
* engine is deleted.
|
|
|
|
*/
|
|
|
|
ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* <p>Virtual destructor.</p>
|
|
|
|
*/
|
|
|
|
virtual ~ThaiBreakEngine();
|
|
|
|
|
|
|
|
protected:
|
|
|
|
/**
|
|
|
|
* <p>Divide up a range of known dictionary characters handled by this break engine.</p>
|
|
|
|
*
|
|
|
|
* @param text A UText representing the text
|
|
|
|
* @param rangeStart The start of the range of dictionary characters
|
|
|
|
* @param rangeEnd The end of the range of dictionary characters
|
|
|
|
* @param foundBreaks Output of C array of int32_t break positions, or 0
|
2021-10-28 08:15:28 +02:00
|
|
|
* @param status Information on any errors encountered.
|
2020-08-11 11:10:23 +02:00
|
|
|
* @return The number of breaks found
|
|
|
|
*/
|
|
|
|
virtual int32_t divideUpDictionaryRange( UText *text,
|
|
|
|
int32_t rangeStart,
|
|
|
|
int32_t rangeEnd,
|
2021-10-28 08:15:28 +02:00
|
|
|
UVector32 &foundBreaks,
|
2022-05-17 17:14:19 +02:00
|
|
|
UBool isPhraseBreaking,
|
2021-10-28 08:15:28 +02:00
|
|
|
UErrorCode& status) const override;
|
2020-08-11 11:10:23 +02:00
|
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
/*******************************************************************
|
|
|
|
* LaoBreakEngine
|
|
|
|
*/
|
|
|
|
|
|
|
|
/**
|
|
|
|
* <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
|
|
|
|
* dictionary and heuristics to determine Lao-specific breaks.</p>
|
|
|
|
*
|
|
|
|
* <p>After it is constructed a LaoBreakEngine may be shared between
|
|
|
|
* threads without synchronization.</p>
|
|
|
|
*/
|
|
|
|
class LaoBreakEngine : public DictionaryBreakEngine {
|
|
|
|
private:
|
|
|
|
/**
|
|
|
|
* The set of characters handled by this engine
|
|
|
|
* @internal
|
|
|
|
*/
|
|
|
|
|
|
|
|
UnicodeSet fEndWordSet;
|
|
|
|
UnicodeSet fBeginWordSet;
|
|
|
|
UnicodeSet fMarkSet;
|
|
|
|
DictionaryMatcher *fDictionary;
|
|
|
|
|
|
|
|
public:
|
|
|
|
|
|
|
|
/**
|
|
|
|
* <p>Default constructor.</p>
|
|
|
|
*
|
|
|
|
* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
|
|
|
|
* engine is deleted.
|
|
|
|
*/
|
|
|
|
LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* <p>Virtual destructor.</p>
|
|
|
|
*/
|
|
|
|
virtual ~LaoBreakEngine();
|
|
|
|
|
|
|
|
protected:
|
|
|
|
/**
|
|
|
|
* <p>Divide up a range of known dictionary characters handled by this break engine.</p>
|
|
|
|
*
|
|
|
|
* @param text A UText representing the text
|
|
|
|
* @param rangeStart The start of the range of dictionary characters
|
|
|
|
* @param rangeEnd The end of the range of dictionary characters
|
|
|
|
* @param foundBreaks Output of C array of int32_t break positions, or 0
|
2021-10-28 08:15:28 +02:00
|
|
|
* @param status Information on any errors encountered.
|
2020-08-11 11:10:23 +02:00
|
|
|
* @return The number of breaks found
|
|
|
|
*/
|
|
|
|
virtual int32_t divideUpDictionaryRange( UText *text,
|
|
|
|
int32_t rangeStart,
|
|
|
|
int32_t rangeEnd,
|
2021-10-28 08:15:28 +02:00
|
|
|
UVector32 &foundBreaks,
|
2022-05-17 17:14:19 +02:00
|
|
|
UBool isPhraseBreaking,
|
2021-10-28 08:15:28 +02:00
|
|
|
UErrorCode& status) const override;
|
|
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
/*******************************************************************
|
|
|
|
* BurmeseBreakEngine
|
|
|
|
*/
|
|
|
|
|
|
|
|
/**
|
|
|
|
* <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
|
|
|
|
* DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
|
|
|
|
*
|
|
|
|
* <p>After it is constructed a BurmeseBreakEngine may be shared between
|
|
|
|
* threads without synchronization.</p>
|
|
|
|
*/
|
|
|
|
class BurmeseBreakEngine : public DictionaryBreakEngine {
|
|
|
|
private:
|
|
|
|
/**
|
|
|
|
* The set of characters handled by this engine
|
|
|
|
* @internal
|
|
|
|
*/
|
|
|
|
|
|
|
|
UnicodeSet fEndWordSet;
|
|
|
|
UnicodeSet fBeginWordSet;
|
|
|
|
UnicodeSet fMarkSet;
|
|
|
|
DictionaryMatcher *fDictionary;
|
|
|
|
|
|
|
|
public:
|
|
|
|
|
|
|
|
/**
|
|
|
|
* <p>Default constructor.</p>
|
|
|
|
*
|
|
|
|
* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
|
|
|
|
* engine is deleted.
|
|
|
|
*/
|
|
|
|
BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* <p>Virtual destructor.</p>
|
|
|
|
*/
|
|
|
|
virtual ~BurmeseBreakEngine();
|
|
|
|
|
|
|
|
protected:
|
|
|
|
/**
|
|
|
|
* <p>Divide up a range of known dictionary characters.</p>
|
|
|
|
*
|
|
|
|
* @param text A UText representing the text
|
|
|
|
* @param rangeStart The start of the range of dictionary characters
|
|
|
|
* @param rangeEnd The end of the range of dictionary characters
|
|
|
|
* @param foundBreaks Output of C array of int32_t break positions, or 0
|
|
|
|
* @param status Information on any errors encountered.
|
|
|
|
* @return The number of breaks found
|
|
|
|
*/
|
|
|
|
virtual int32_t divideUpDictionaryRange( UText *text,
|
|
|
|
int32_t rangeStart,
|
|
|
|
int32_t rangeEnd,
|
|
|
|
UVector32 &foundBreaks,
|
2022-05-17 17:14:19 +02:00
|
|
|
UBool isPhraseBreaking,
|
2021-10-28 08:15:28 +02:00
|
|
|
UErrorCode& status) const override;
|
|
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
/*******************************************************************
|
|
|
|
* KhmerBreakEngine
|
|
|
|
*/
|
|
|
|
|
|
|
|
/**
|
|
|
|
* <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
|
|
|
|
* DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
|
|
|
|
*
|
|
|
|
* <p>After it is constructed a KhmerBreakEngine may be shared between
|
|
|
|
* threads without synchronization.</p>
|
|
|
|
*/
|
|
|
|
class KhmerBreakEngine : public DictionaryBreakEngine {
|
|
|
|
private:
|
|
|
|
/**
|
|
|
|
* The set of characters handled by this engine
|
|
|
|
* @internal
|
|
|
|
*/
|
|
|
|
|
|
|
|
UnicodeSet fEndWordSet;
|
|
|
|
UnicodeSet fBeginWordSet;
|
|
|
|
UnicodeSet fMarkSet;
|
|
|
|
DictionaryMatcher *fDictionary;
|
|
|
|
|
|
|
|
public:
|
|
|
|
|
|
|
|
/**
|
|
|
|
* <p>Default constructor.</p>
|
|
|
|
*
|
|
|
|
* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
|
|
|
|
* engine is deleted.
|
|
|
|
*/
|
|
|
|
KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* <p>Virtual destructor.</p>
|
|
|
|
*/
|
|
|
|
virtual ~KhmerBreakEngine();
|
|
|
|
|
|
|
|
protected:
|
|
|
|
/**
|
|
|
|
* <p>Divide up a range of known dictionary characters.</p>
|
|
|
|
*
|
|
|
|
* @param text A UText representing the text
|
|
|
|
* @param rangeStart The start of the range of dictionary characters
|
|
|
|
* @param rangeEnd The end of the range of dictionary characters
|
|
|
|
* @param foundBreaks Output of C array of int32_t break positions, or 0
|
|
|
|
* @param status Information on any errors encountered.
|
|
|
|
* @return The number of breaks found
|
|
|
|
*/
|
|
|
|
virtual int32_t divideUpDictionaryRange( UText *text,
|
|
|
|
int32_t rangeStart,
|
|
|
|
int32_t rangeEnd,
|
|
|
|
UVector32 &foundBreaks,
|
2022-05-17 17:14:19 +02:00
|
|
|
UBool isPhraseBreaking,
|
2021-10-28 08:15:28 +02:00
|
|
|
UErrorCode& status) const override;
|
2020-08-11 11:10:23 +02:00
|
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
#if !UCONFIG_NO_NORMALIZATION
|
|
|
|
|
|
|
|
/*******************************************************************
|
|
|
|
* CjkBreakEngine
|
|
|
|
*/
|
|
|
|
|
|
|
|
//indicates language/script that the CjkBreakEngine will handle
|
|
|
|
enum LanguageType {
|
|
|
|
kKorean,
|
|
|
|
kChineseJapanese
|
|
|
|
};
|
|
|
|
|
|
|
|
/**
|
|
|
|
* <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
|
|
|
|
* dictionary with costs associated with each word and
|
|
|
|
* Viterbi decoding to determine CJK-specific breaks.</p>
|
|
|
|
*/
|
|
|
|
class CjkBreakEngine : public DictionaryBreakEngine {
|
|
|
|
protected:
|
|
|
|
/**
|
|
|
|
* The set of characters handled by this engine
|
|
|
|
* @internal
|
|
|
|
*/
|
|
|
|
UnicodeSet fHangulWordSet;
|
2022-05-17 17:14:19 +02:00
|
|
|
UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
|
|
|
|
UnicodeSet fClosePunctuationSet;
|
2020-08-11 11:10:23 +02:00
|
|
|
|
|
|
|
DictionaryMatcher *fDictionary;
|
|
|
|
const Normalizer2 *nfkcNorm2;
|
2023-05-23 02:05:01 +02:00
|
|
|
MlBreakEngine *fMlBreakEngine;
|
|
|
|
bool isCj;
|
2020-08-11 11:10:23 +02:00
|
|
|
|
2022-05-17 17:14:19 +02:00
|
|
|
private:
|
|
|
|
// Load Japanese extensions.
|
|
|
|
void loadJapaneseExtensions(UErrorCode& error);
|
|
|
|
// Load Japanese Hiragana.
|
|
|
|
void loadHiragana(UErrorCode& error);
|
|
|
|
// Initialize fSkipSet by loading Japanese Hiragana and extensions.
|
|
|
|
void initJapanesePhraseParameter(UErrorCode& error);
|
|
|
|
|
|
|
|
Hashtable fSkipSet;
|
|
|
|
|
2020-08-11 11:10:23 +02:00
|
|
|
public:
|
|
|
|
|
|
|
|
/**
|
|
|
|
* <p>Default constructor.</p>
|
|
|
|
*
|
|
|
|
* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
|
|
|
|
* engine is deleted. The DictionaryMatcher must contain costs for each word
|
|
|
|
* in order for the dictionary to work properly.
|
|
|
|
*/
|
|
|
|
CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* <p>Virtual destructor.</p>
|
|
|
|
*/
|
|
|
|
virtual ~CjkBreakEngine();
|
|
|
|
|
|
|
|
protected:
|
|
|
|
/**
|
|
|
|
* <p>Divide up a range of known dictionary characters handled by this break engine.</p>
|
|
|
|
*
|
|
|
|
* @param text A UText representing the text
|
|
|
|
* @param rangeStart The start of the range of dictionary characters
|
|
|
|
* @param rangeEnd The end of the range of dictionary characters
|
|
|
|
* @param foundBreaks Output of C array of int32_t break positions, or 0
|
2021-10-28 08:15:28 +02:00
|
|
|
* @param status Information on any errors encountered.
|
2020-08-11 11:10:23 +02:00
|
|
|
* @return The number of breaks found
|
|
|
|
*/
|
|
|
|
virtual int32_t divideUpDictionaryRange( UText *text,
|
|
|
|
int32_t rangeStart,
|
|
|
|
int32_t rangeEnd,
|
2021-10-28 08:15:28 +02:00
|
|
|
UVector32 &foundBreaks,
|
2022-05-17 17:14:19 +02:00
|
|
|
UBool isPhraseBreaking,
|
2021-10-28 08:15:28 +02:00
|
|
|
UErrorCode& status) const override;
|
2020-08-11 11:10:23 +02:00
|
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
U_NAMESPACE_END
|
|
|
|
|
|
|
|
/* DICTBE_H */
|
|
|
|
#endif
|