2020-08-11 11:10:23 +02:00
|
|
|
// © 2016 and later: Unicode, Inc. and others.
|
|
|
|
// License & terms of use: http://www.unicode.org/copyright.html
|
|
|
|
/*
|
|
|
|
*******************************************************************************
|
|
|
|
*
|
|
|
|
* Copyright (C) 1999-2014 International Business Machines
|
|
|
|
* Corporation and others. All Rights Reserved.
|
|
|
|
*
|
|
|
|
*******************************************************************************
|
|
|
|
* file name: rbbidata.h
|
|
|
|
* encoding: UTF-8
|
|
|
|
* tab size: 8 (not used)
|
|
|
|
* indentation:4
|
|
|
|
*
|
|
|
|
* RBBI data formats Includes
|
|
|
|
*
|
|
|
|
* Structs that describe the format of the Binary RBBI data,
|
|
|
|
* as it is stored in ICU's data file.
|
|
|
|
*
|
|
|
|
* RBBIDataWrapper - Instances of this class sit between the
|
|
|
|
* raw data structs and the RulesBasedBreakIterator objects
|
|
|
|
* that are created by applications. The wrapper class
|
|
|
|
* provides reference counting for the underlying data,
|
|
|
|
* and direct pointers to data that would not otherwise
|
|
|
|
* be accessible without ugly pointer arithmetic. The
|
|
|
|
* wrapper does not attempt to provide any higher level
|
|
|
|
* abstractions for the data itself.
|
|
|
|
*
|
|
|
|
* There will be only one instance of RBBIDataWrapper for any
|
|
|
|
* set of RBBI run time data being shared by instances
|
|
|
|
* (clones) of RulesBasedBreakIterator.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef __RBBIDATA_H__
|
|
|
|
#define __RBBIDATA_H__
|
|
|
|
|
|
|
|
#include "unicode/utypes.h"
|
|
|
|
#include "unicode/udata.h"
|
|
|
|
#include "udataswp.h"
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Swap RBBI data. See udataswp.h.
|
|
|
|
* @internal
|
|
|
|
*/
|
|
|
|
U_CAPI int32_t U_EXPORT2
|
|
|
|
ubrk_swap(const UDataSwapper *ds,
|
|
|
|
const void *inData, int32_t length, void *outData,
|
|
|
|
UErrorCode *pErrorCode);
|
|
|
|
|
|
|
|
#ifdef __cplusplus
|
|
|
|
|
|
|
|
#include "unicode/ucptrie.h"
|
|
|
|
#include "unicode/uobject.h"
|
|
|
|
#include "unicode/unistr.h"
|
|
|
|
#include "unicode/uversion.h"
|
|
|
|
#include "umutex.h"
|
|
|
|
|
|
|
|
|
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
|
|
|
|
// The current RBBI data format version.
// Four bytes, most significant component first: {6, 0, 0, 0}.
static const uint8_t RBBI_DATA_FORMAT_VERSION[] = {6, 0, 0, 0};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The following structs map exactly onto the raw data from ICU common data file.
|
|
|
|
*/
|
|
|
|
struct RBBIDataHeader {
|
|
|
|
uint32_t fMagic; /* == 0xbla0 */
|
|
|
|
UVersionInfo fFormatVersion; /* Data Format. Same as the value in struct UDataInfo */
|
|
|
|
/* if there is one associated with this data. */
|
|
|
|
/* (version originates in rbbi, is copied to UDataInfo) */
|
|
|
|
uint32_t fLength; /* Total length in bytes of this RBBI Data, */
|
|
|
|
/* including all sections, not just the header. */
|
|
|
|
uint32_t fCatCount; /* Number of character categories. */
|
|
|
|
|
|
|
|
/* */
|
|
|
|
/* Offsets and sizes of each of the subsections within the RBBI data. */
|
|
|
|
/* All offsets are bytes from the start of the RBBIDataHeader. */
|
|
|
|
/* All sizes are in bytes. */
|
|
|
|
/* */
|
|
|
|
uint32_t fFTable; /* forward state transition table. */
|
|
|
|
uint32_t fFTableLen;
|
|
|
|
uint32_t fRTable; /* Offset to the reverse state transition table. */
|
|
|
|
uint32_t fRTableLen;
|
|
|
|
uint32_t fTrie; /* Offset to Trie data for character categories */
|
|
|
|
uint32_t fTrieLen;
|
|
|
|
uint32_t fRuleSource; /* Offset to the source for for the break */
|
2023-05-23 02:05:01 +02:00
|
|
|
uint32_t fRuleSourceLen; /* rules. Stored char16_t *. */
|
2020-08-11 11:10:23 +02:00
|
|
|
uint32_t fStatusTable; /* Offset to the table of rule status values */
|
|
|
|
uint32_t fStatusTableLen;
|
|
|
|
|
|
|
|
uint32_t fReserved[6]; /* Reserved for expansion */
|
|
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// One row of a state transition table. T is the integer type used for every
// cell of the row (see the RBBIStateTableRow8 / RBBIStateTableRow16 typedefs).
template <typename T>
struct RBBIStateTableRowT {
    T               fAccepting;    //  Non-zero if this row is for an accepting state.
                                   //  Value 0: not an accepting state.
                                   //        1: (ACCEPTING_UNCONDITIONAL) Unconditional Accepting state.
                                   //       >1: Look-ahead match has completed.
                                   //           Actual boundary position happened earlier.
                                   //           Value here == fLookAhead in earlier
                                   //           state, at actual boundary pos.
    T               fLookAhead;    //  Non-zero if this row is for a state that
                                   //    corresponds to a '/' in the rule source.
                                   //    Value is the same as the fAccepting
                                   //    value for the rule (which will appear
                                   //    in a different state).
    T               fTagsIdx;      //  Non-zero if this row covers a {tagged} position
                                   //    from a rule.  Value is the index in the
                                   //    StatusTable of the set of matching
                                   //    tags (rule status values)
    T               fNextState[1]; //  Next State, indexed by char category.
                                   //    Variable-length array declared with length 1
                                   //    to disable bounds checkers.
                                   //    Array Size is actually fData->fHeader->fCatCount
                                   //    CAUTION:  see RBBITableBuilder::getTableSize()
                                   //              before changing anything here.
};
|
|
|
|
|
|
|
|
// Concrete row layouts: every cell is 8 bits wide or 16 bits wide respectively.
typedef RBBIStateTableRowT<uint8_t> RBBIStateTableRow8;
typedef RBBIStateTableRowT<uint16_t> RBBIStateTableRow16;
|
|
|
|
|
|
|
|
// Value constant for RBBIStateTableRow::fAccepting:
// marks a row as an unconditional accepting state.
constexpr uint16_t ACCEPTING_UNCONDITIONAL = 1;   // Value constant for RBBIStateTableRow::fAccepting
|
|
|
|
|
|
|
|
union RBBIStateTableRow {
|
|
|
|
RBBIStateTableRow16 r16;
|
|
|
|
RBBIStateTableRow8 r8;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Header of a state transition table, followed directly by the table rows.
struct RBBIStateTable {
    uint32_t         fNumStates;            // Number of states.
    uint32_t         fRowLen;               // Length of a state table row, in bytes.
    uint32_t         fDictCategoriesStart;  // Char category number of the first dictionary
                                            //   char class, or the largest category number + 1
                                            //   if there are no dictionary categories.
    uint32_t         fLookAheadResultsSize; // Size of run-time array required for holding
                                            //   look-ahead results. Indexed by row.fLookAhead.
    uint32_t         fFlags;                // Option Flags for this state table.
    char             fTableData[1];         // First RBBIStateTableRow begins here.
                                            //   Variable-length array declared with length 1
                                            //   to disable bounds checkers.
                                            //   (making it char[] simplifies ugly address
                                            //   arithmetic for indexing variable length rows.)
};
|
|
|
|
|
|
|
|
// Option flag values. Each constant is a distinct single bit, so they can be combined.
// NOTE(review): presumably these are the bits stored in RBBIStateTable::fFlags - confirm.
constexpr uint32_t RBBI_LOOKAHEAD_HARD_BREAK = 1;
constexpr uint32_t RBBI_BOF_REQUIRED = 2;
constexpr uint32_t RBBI_8BITS_ROWS = 4;
|
|
|
|
|
|
|
|
|
|
|
|
/* */
|
|
|
|
/* The reference counting wrapper class */
|
|
|
|
/* */
|
|
|
|
class RBBIDataWrapper : public UMemory {
|
|
|
|
public:
|
|
|
|
enum EDontAdopt {
|
|
|
|
kDontAdopt
|
|
|
|
};
|
|
|
|
RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status);
|
|
|
|
RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt dontAdopt, UErrorCode &status);
|
|
|
|
RBBIDataWrapper(UDataMemory* udm, UErrorCode &status);
|
|
|
|
~RBBIDataWrapper();
|
|
|
|
|
|
|
|
static UBool isDataVersionAcceptable(const UVersionInfo version);
|
|
|
|
|
|
|
|
void init0();
|
|
|
|
void init(const RBBIDataHeader *data, UErrorCode &status);
|
|
|
|
RBBIDataWrapper *addReference();
|
|
|
|
void removeReference();
|
2021-10-28 08:15:28 +02:00
|
|
|
bool operator ==(const RBBIDataWrapper &other) const;
|
2020-08-11 11:10:23 +02:00
|
|
|
int32_t hashCode();
|
|
|
|
const UnicodeString &getRuleSourceString() const;
|
|
|
|
void printData();
|
|
|
|
void printTable(const char *heading, const RBBIStateTable *table);
|
|
|
|
|
|
|
|
/* */
|
|
|
|
/* Pointers to items within the data */
|
|
|
|
/* */
|
|
|
|
const RBBIDataHeader *fHeader;
|
|
|
|
const RBBIStateTable *fForwardTable;
|
|
|
|
const RBBIStateTable *fReverseTable;
|
|
|
|
const char *fRuleSource;
|
|
|
|
const int32_t *fRuleStatusTable;
|
|
|
|
|
|
|
|
/* number of int32_t values in the rule status table. Used to sanity check indexing */
|
|
|
|
int32_t fStatusMaxIdx;
|
|
|
|
|
|
|
|
UCPTrie *fTrie;
|
|
|
|
|
|
|
|
private:
|
|
|
|
u_atomic_int32_t fRefCount;
|
|
|
|
UDataMemory *fUDataMem;
|
|
|
|
UnicodeString fRuleString;
|
|
|
|
UBool fDontFreeData;
|
|
|
|
|
|
|
|
RBBIDataWrapper(const RBBIDataWrapper &other) = delete; /* forbid copying of this class */
|
|
|
|
RBBIDataWrapper &operator=(const RBBIDataWrapper &other) = delete; /* forbid copying of this class */
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
U_NAMESPACE_END
|
|
|
|
|
2023-05-23 02:05:01 +02:00
|
|
|
// Cleanup entry point for RBBI; declared here, defined outside this header.
// NOTE(review): the meaning of the returned UBool is not visible from this file.
U_CFUNC UBool rbbi_cleanup();
|
2020-08-11 11:10:23 +02:00
|
|
|
|
|
|
|
#endif /* C++ */
|
|
|
|
|
|
|
|
#endif
|