a5ee5011ca
Changelog: https://github.com/PCRE2Project/pcre2/blob/pcre2-10.40/ChangeLog
(cherry picked from commit fd6eb2c2d2
)
394 lines
10 KiB
C++
394 lines
10 KiB
C++
/*************************************************
|
|
* Perl-Compatible Regular Expressions *
|
|
*************************************************/
|
|
|
|
/* PCRE is a library of functions to support regular expressions whose syntax
|
|
and semantics are as close as possible to those of the Perl 5 language.
|
|
|
|
Written by Philip Hazel
|
|
Original API code Copyright (c) 1997-2012 University of Cambridge
|
|
New API code Copyright (c) 2016-2022 University of Cambridge
|
|
|
|
This module is auto-generated from Unicode data files. DO NOT EDIT MANUALLY!
|
|
Instead, modify the maint/GenerateUcpHeader.py script and run it to generate
|
|
a new version of this code.
|
|
|
|
-----------------------------------------------------------------------------
|
|
Redistribution and use in source and binary forms, with or without
|
|
modification, are permitted provided that the following conditions are met:
|
|
|
|
* Redistributions of source code must retain the above copyright notice,
|
|
this list of conditions and the following disclaimer.
|
|
|
|
* Redistributions in binary form must reproduce the above copyright
|
|
notice, this list of conditions and the following disclaimer in the
|
|
documentation and/or other materials provided with the distribution.
|
|
|
|
* Neither the name of the University of Cambridge nor the names of its
|
|
contributors may be used to endorse or promote products derived from
|
|
this software without specific prior written permission.
|
|
|
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
POSSIBILITY OF SUCH DAMAGE.
|
|
-----------------------------------------------------------------------------
|
|
*/
|
|
|
|
#ifndef PCRE2_UCP_H_IDEMPOTENT_GUARD
|
|
#define PCRE2_UCP_H_IDEMPOTENT_GUARD
|
|
|
|
/* This file contains definitions of the Unicode property values that are
|
|
returned by the UCD access macros and used throughout PCRE2.
|
|
|
|
IMPORTANT: The specific values of the first two enums (general and particular
|
|
character categories) are assumed by the table called catposstab in the file
|
|
pcre2_auto_possess.c. They are unlikely to change, but should be checked after
|
|
an update. */
|
|
|
|
/* These are the general character categories. */
|
|
|
|
enum {
  /* General (one-letter) Unicode character categories. The enumerators are
  numbered 0..6 by position; catposstab in pcre2_auto_possess.c assumes these
  specific values, so the order must not be changed. */
  ucp_C,     /* Other */
  ucp_L,     /* Letter */
  ucp_M,     /* Mark */
  ucp_N,     /* Number */
  ucp_P,     /* Punctuation */
  ucp_S,     /* Symbol */
  ucp_Z      /* Separator */
};
|
|
|
|
/* These are the particular character categories. */
|
|
|
|
enum {
  /* Particular (two-letter) Unicode character categories, numbered 0..29 by
  position. catposstab in pcre2_auto_possess.c assumes these specific values,
  so the order must not be changed. */
  ucp_Cc,    /* Control */
  ucp_Cf,    /* Format */
  ucp_Cn,    /* Unassigned */
  ucp_Co,    /* Private use */
  ucp_Cs,    /* Surrogate */
  ucp_Ll,    /* Lower case letter */
  ucp_Lm,    /* Modifier letter */
  ucp_Lo,    /* Other letter */
  ucp_Lt,    /* Title case letter */
  ucp_Lu,    /* Upper case letter */
  ucp_Mc,    /* Spacing mark */
  ucp_Me,    /* Enclosing mark */
  ucp_Mn,    /* Non-spacing mark */
  ucp_Nd,    /* Decimal number */
  ucp_Nl,    /* Letter number */
  ucp_No,    /* Other number */
  ucp_Pc,    /* Connector punctuation */
  ucp_Pd,    /* Dash punctuation */
  ucp_Pe,    /* Close punctuation */
  ucp_Pf,    /* Final punctuation */
  ucp_Pi,    /* Initial punctuation */
  ucp_Po,    /* Other punctuation */
  ucp_Ps,    /* Open punctuation */
  ucp_Sc,    /* Currency symbol */
  ucp_Sk,    /* Modifier symbol */
  ucp_Sm,    /* Mathematical symbol */
  ucp_So,    /* Other symbol */
  ucp_Zl,    /* Line separator */
  ucp_Zp,    /* Paragraph separator */
  ucp_Zs     /* Space separator */
};
|
|
|
|
/* These are Boolean properties. */
|
|
|
|
enum {
  /* Boolean (binary) Unicode properties, numbered from 0 by position. */
  ucp_ASCII,
  ucp_ASCII_Hex_Digit,
  ucp_Alphabetic,
  ucp_Bidi_Control,
  ucp_Bidi_Mirrored,
  ucp_Case_Ignorable,
  ucp_Cased,
  ucp_Changes_When_Casefolded,
  ucp_Changes_When_Casemapped,
  ucp_Changes_When_Lowercased,
  ucp_Changes_When_Titlecased,
  ucp_Changes_When_Uppercased,
  ucp_Dash,
  ucp_Default_Ignorable_Code_Point,
  ucp_Deprecated,
  ucp_Diacritic,
  ucp_Emoji,
  ucp_Emoji_Component,
  ucp_Emoji_Modifier,
  ucp_Emoji_Modifier_Base,
  ucp_Emoji_Presentation,
  ucp_Extended_Pictographic,
  ucp_Extender,
  ucp_Grapheme_Base,
  ucp_Grapheme_Extend,
  ucp_Grapheme_Link,
  ucp_Hex_Digit,
  ucp_IDS_Binary_Operator,
  ucp_IDS_Trinary_Operator,
  ucp_ID_Continue,
  ucp_ID_Start,
  ucp_Ideographic,
  ucp_Join_Control,
  ucp_Logical_Order_Exception,
  ucp_Lowercase,
  ucp_Math,
  ucp_Noncharacter_Code_Point,
  ucp_Pattern_Syntax,
  ucp_Pattern_White_Space,
  ucp_Prepended_Concatenation_Mark,
  ucp_Quotation_Mark,
  ucp_Radical,
  ucp_Regional_Indicator,
  ucp_Sentence_Terminal,
  ucp_Soft_Dotted,
  ucp_Terminal_Punctuation,
  ucp_Unified_Ideograph,
  ucp_Uppercase,
  ucp_Variation_Selector,
  ucp_White_Space,
  ucp_XID_Continue,
  ucp_XID_Start,
  /* This must be last: because it follows every property, its value is the
  number of Boolean properties defined above. */
  ucp_Bprop_Count
};
|
|
|
|
/* Size of entries in ucd_boolprop_sets[] */
|
|
|
|
/* NOTE(review): the unit is not visible in this header. Presumably this counts
32-bit words per entry (2 * 32 = 64 bits, enough for one bit per Boolean
property up to ucp_Bprop_Count) -- confirm against the definition of
ucd_boolprop_sets[]. */
#define ucd_boolprop_sets_item_size 2
|
|
|
|
/* These are the bidi class values. */
|
|
|
|
enum {
  /* Bidirectional class values, numbered 0..22 by position. */
  ucp_bidiAL,    /* Arabic letter */
  ucp_bidiAN,    /* Arabic number */
  ucp_bidiB,     /* Paragraph separator */
  ucp_bidiBN,    /* Boundary neutral */
  ucp_bidiCS,    /* Common separator */
  ucp_bidiEN,    /* European number */
  ucp_bidiES,    /* European separator */
  ucp_bidiET,    /* European terminator */
  ucp_bidiFSI,   /* First strong isolate */
  ucp_bidiL,     /* Left to right */
  ucp_bidiLRE,   /* Left to right embedding */
  ucp_bidiLRI,   /* Left to right isolate */
  ucp_bidiLRO,   /* Left to right override */
  ucp_bidiNSM,   /* Non-spacing mark */
  ucp_bidiON,    /* Other neutral */
  ucp_bidiPDF,   /* Pop directional format */
  ucp_bidiPDI,   /* Pop directional isolate */
  ucp_bidiR,     /* Right to left */
  ucp_bidiRLE,   /* Right to left embedding */
  ucp_bidiRLI,   /* Right to left isolate */
  ucp_bidiRLO,   /* Right to left override */
  ucp_bidiS,     /* Segment separator */
  ucp_bidiWS     /* White space */
};
|
|
|
|
/* These are grapheme break properties. The Extended Pictographic property
|
|
comes from the emoji-data.txt file. */
|
|
|
|
enum {
  /* Grapheme break property values. The number in each comment is the value
  the enumerator receives from its position in this list. */
  ucp_gbCR,                    /*  0 */
  ucp_gbLF,                    /*  1 */
  ucp_gbControl,               /*  2 */
  ucp_gbExtend,                /*  3 */
  ucp_gbPrepend,               /*  4 */
  ucp_gbSpacingMark,           /*  5 */
  ucp_gbL,                     /*  6 Hangul syllable type L */
  ucp_gbV,                     /*  7 Hangul syllable type V */
  ucp_gbT,                     /*  8 Hangul syllable type T */
  ucp_gbLV,                    /*  9 Hangul syllable type LV */
  ucp_gbLVT,                   /* 10 Hangul syllable type LVT */
  ucp_gbRegional_Indicator,    /* 11 */
  ucp_gbOther,                 /* 12 */
  ucp_gbZWJ,                   /* 13 */
  ucp_gbExtended_Pictographic  /* 14 */
};
|
|
|
|
/* These are the script identifications. */
|
|
|
|
enum {
  /* Script identifiers, numbered from 0 by position. The list is split into
  two groups; the first group occupies the lowest values. */

  /* Scripts which have characters in other scripts. */
  ucp_Latin,
  ucp_Greek,
  ucp_Cyrillic,
  ucp_Arabic,
  ucp_Syriac,
  ucp_Thaana,
  ucp_Devanagari,
  ucp_Bengali,
  ucp_Gurmukhi,
  ucp_Gujarati,
  ucp_Oriya,
  ucp_Tamil,
  ucp_Telugu,
  ucp_Kannada,
  ucp_Malayalam,
  ucp_Sinhala,
  ucp_Myanmar,
  ucp_Georgian,
  ucp_Hangul,
  ucp_Mongolian,
  ucp_Hiragana,
  ucp_Katakana,
  ucp_Bopomofo,
  ucp_Han,
  ucp_Yi,
  ucp_Tagalog,
  ucp_Hanunoo,
  ucp_Buhid,
  ucp_Tagbanwa,
  ucp_Limbu,
  ucp_Tai_Le,
  ucp_Linear_B,
  ucp_Cypriot,
  ucp_Buginese,
  ucp_Coptic,
  ucp_Glagolitic,
  ucp_Syloti_Nagri,
  ucp_Phags_Pa,
  ucp_Nko,
  ucp_Kayah_Li,
  ucp_Javanese,
  ucp_Kaithi,
  ucp_Mandaic,
  ucp_Chakma,
  ucp_Sharada,
  ucp_Takri,
  ucp_Duployan,
  ucp_Grantha,
  ucp_Khojki,
  ucp_Linear_A,
  ucp_Mahajani,
  ucp_Manichaean,
  ucp_Modi,
  ucp_Old_Permic,
  ucp_Psalter_Pahlavi,
  ucp_Khudawadi,
  ucp_Tirhuta,
  ucp_Multani,
  ucp_Adlam,
  ucp_Masaram_Gondi,
  ucp_Dogra,
  ucp_Gunjala_Gondi,
  ucp_Hanifi_Rohingya,
  ucp_Sogdian,
  ucp_Nandinagari,
  ucp_Yezidi,
  ucp_Cypro_Minoan,
  ucp_Old_Uyghur,

  /* Scripts which have no characters in other scripts. */
  ucp_Unknown,
  ucp_Common,
  ucp_Armenian,
  ucp_Hebrew,
  ucp_Thai,
  ucp_Lao,
  ucp_Tibetan,
  ucp_Ethiopic,
  ucp_Cherokee,
  ucp_Canadian_Aboriginal,
  ucp_Ogham,
  ucp_Runic,
  ucp_Khmer,
  ucp_Old_Italic,
  ucp_Gothic,
  ucp_Deseret,
  ucp_Inherited,
  ucp_Ugaritic,
  ucp_Shavian,
  ucp_Osmanya,
  ucp_Braille,
  ucp_New_Tai_Lue,
  ucp_Tifinagh,
  ucp_Old_Persian,
  ucp_Kharoshthi,
  ucp_Balinese,
  ucp_Cuneiform,
  ucp_Phoenician,
  ucp_Sundanese,
  ucp_Lepcha,
  ucp_Ol_Chiki,
  ucp_Vai,
  ucp_Saurashtra,
  ucp_Rejang,
  ucp_Lycian,
  ucp_Carian,
  ucp_Lydian,
  ucp_Cham,
  ucp_Tai_Tham,
  ucp_Tai_Viet,
  ucp_Avestan,
  ucp_Egyptian_Hieroglyphs,
  ucp_Samaritan,
  ucp_Lisu,
  ucp_Bamum,
  ucp_Meetei_Mayek,
  ucp_Imperial_Aramaic,
  ucp_Old_South_Arabian,
  ucp_Inscriptional_Parthian,
  ucp_Inscriptional_Pahlavi,
  ucp_Old_Turkic,
  ucp_Batak,
  ucp_Brahmi,
  ucp_Meroitic_Cursive,
  ucp_Meroitic_Hieroglyphs,
  ucp_Miao,
  ucp_Sora_Sompeng,
  ucp_Caucasian_Albanian,
  ucp_Bassa_Vah,
  ucp_Elbasan,
  ucp_Pahawh_Hmong,
  ucp_Mende_Kikakui,
  ucp_Mro,
  ucp_Old_North_Arabian,
  ucp_Nabataean,
  ucp_Palmyrene,
  ucp_Pau_Cin_Hau,
  ucp_Siddham,
  ucp_Warang_Citi,
  ucp_Ahom,
  ucp_Anatolian_Hieroglyphs,
  ucp_Hatran,
  ucp_Old_Hungarian,
  ucp_SignWriting,
  ucp_Bhaiksuki,
  ucp_Marchen,
  ucp_Newa,
  ucp_Osage,
  ucp_Tangut,
  ucp_Nushu,
  ucp_Soyombo,
  ucp_Zanabazar_Square,
  ucp_Makasar,
  ucp_Medefaidrin,
  ucp_Old_Sogdian,
  ucp_Elymaic,
  ucp_Nyiakeng_Puachue_Hmong,
  ucp_Wancho,
  ucp_Chorasmian,
  ucp_Dives_Akuru,
  ucp_Khitan_Small_Script,
  ucp_Tangsa,
  ucp_Toto,
  ucp_Vithkuqi,

  /* This must be last: because it follows every script, its value is the
  number of script identifiers defined above. */
  ucp_Script_Count
};
|
|
|
|
/* Size of entries in ucd_script_sets[] */
|
|
|
|
/* NOTE(review): the unit is not visible in this header. Presumably this counts
32-bit words per entry (3 * 32 = 96 bits, enough for one bit per script in the
first group of the script enum) -- confirm against the definition of
ucd_script_sets[]. */
#define ucd_script_sets_item_size 3
|
|
|
|
#endif /* PCRE2_UCP_H_IDEMPOTENT_GUARD */
|
|
|
|
/* End of pcre2_ucp.h */
|