ICU4C: Update to version 74.1

This commit is contained in:
bruvzg 2023-11-01 08:56:12 +02:00
parent 6afd320984
commit 5e55c6c611
No known key found for this signature in database
GPG key ID: 7960FCF39844EC38
65 changed files with 8399 additions and 6799 deletions

View file

@ -401,6 +401,8 @@ if env["builtin_icu4c"]:
"common/uloc.cpp",
"common/uloc_keytype.cpp",
"common/uloc_tag.cpp",
"common/ulocale.cpp",
"common/ulocbuilder.cpp",
"common/umapfile.cpp",
"common/umath.cpp",
"common/umutablecptrie.cpp",
@ -466,7 +468,7 @@ if env["builtin_icu4c"]:
]
thirdparty_sources = [thirdparty_dir + file for file in thirdparty_sources]
icu_data_name = "icudt73l.dat"
icu_data_name = "icudt74l.dat"
if env.editor_build:
env_icu.Depends("#thirdparty/icu4c/icudata.gen.h", "#thirdparty/icu4c/" + icu_data_name)

View file

@ -623,6 +623,8 @@ thirdparty_icu_sources = [
"common/uloc.cpp",
"common/uloc_keytype.cpp",
"common/uloc_tag.cpp",
"common/ulocale.cpp",
"common/ulocbuilder.cpp",
"common/umapfile.cpp",
"common/umath.cpp",
"common/umutablecptrie.cpp",
@ -688,7 +690,7 @@ thirdparty_icu_sources = [
]
thirdparty_icu_sources = [thirdparty_icu_dir + file for file in thirdparty_icu_sources]
icu_data_name = "icudt73l.dat"
icu_data_name = "icudt74l.dat"
if env["static_icu_data"]:
env_icu.Depends("../../../thirdparty/icu4c/icudata.gen.h", "../../../thirdparty/icu4c/" + icu_data_name)

View file

@ -353,7 +353,7 @@ Files extracted from upstream source:
## icu4c
- Upstream: https://github.com/unicode-org/icu
- Version: 73.2 (680f521746a3bd6a86f25f25ee50a62d88b489cf, 2023)
- Version: 74.1 (9edac7b78327a1cb58db29e2714b15f9fa14e4d7, 2023)
- License: Unicode
Files extracted from upstream source:
@ -365,7 +365,7 @@ Files extracted from upstream source:
Files generated from upstream source:
- The `icudt73l.dat` built with the provided `godot_data.json` config file (see
- The `icudt74l.dat` built with the provided `godot_data.json` config file (see
https://github.com/unicode-org/icu/blob/master/docs/userguide/icu_data/buildtool.md
for instructions).
@ -375,7 +375,7 @@ Files generated from upstream source:
3. Reconfigure ICU with custom data config:
`ICU_DATA_FILTER_FILE={GODOT_SOURCE}/thirdparty/icu4c/godot_data.json ./runConfigureICU {PLATFORM} --with-data-packaging=common`
4. Delete `data/out` folder and rebuild data: `cd data && rm -rf ./out && make`
5. Copy `source/data/out/icudt73l.dat` to the `{GODOT_SOURCE}/thirdparty/icu4c/icudt73l.dat`
5. Copy `source/data/out/icudt74l.dat` to the `{GODOT_SOURCE}/thirdparty/icu4c/icudt74l.dat`
## jpeg-compressor

View file

@ -1,49 +1,42 @@
UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
See Terms of Use <https://www.unicode.org/copyright.html>
for definitions of Unicode Inc.s Data Files and Software.
NOTICE TO USER: Carefully read the following legal agreement.
BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
TERMS AND CONDITIONS OF THIS AGREEMENT.
IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
THE DATA FILES OR SOFTWARE.
UNICODE LICENSE V3
COPYRIGHT AND PERMISSION NOTICE
Copyright © 1991-2023 Unicode, Inc. All rights reserved.
Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
Copyright © 2016-2023 Unicode, Inc.
Permission is hereby granted, free of charge, to any person obtaining
a copy of the Unicode data files and any associated documentation
(the "Data Files") or Unicode software and any associated documentation
(the "Software") to deal in the Data Files or Software
without restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, and/or sell copies of
the Data Files or Software, and to permit persons to whom the Data Files
or Software are furnished to do so, provided that either
(a) this copyright and permission notice appear with all copies
of the Data Files or Software, or
(b) this copyright and permission notice appear in associated
Documentation.
NOTICE TO USER: Carefully read the following legal agreement. BY
DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR
SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT
DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT OF THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THE DATA FILES OR SOFTWARE.
Permission is hereby granted, free of charge, to any person obtaining a
copy of data files and any associated documentation (the "Data Files") or
software and any associated documentation (the "Software") to deal in the
Data Files or Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, and/or sell
copies of the Data Files or Software, and to permit persons to whom the
Data Files or Software are furnished to do so, provided that either (a)
this copyright and permission notice appear with all copies of the Data
Files or Software, or (b) this copyright and permission notice appear in
associated Documentation.
Except as contained in this notice, the name of a copyright holder
shall not be used in advertising or otherwise to promote the sale,
use or other dealings in these Data Files or Software without prior
written authorization of the copyright holder.
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA
FILES OR SOFTWARE.
Except as contained in this notice, the name of a copyright holder shall
not be used in advertising or otherwise to promote the sale, use or other
dealings in these Data Files or Software without prior written
authorization of the copyright holder.
----------------------------------------------------------------------

View file

@ -21,6 +21,7 @@
#include "unicode/uscript.h"
#include "unicode/ucharstrie.h"
#include "unicode/bytestrie.h"
#include "unicode/rbbi.h"
#include "brkeng.h"
#include "cmemory.h"
@ -70,19 +71,21 @@ UnhandledEngine::~UnhandledEngine() {
}
UBool
UnhandledEngine::handles(UChar32 c) const {
UnhandledEngine::handles(UChar32 c, const char* locale) const {
(void)locale; // Unused
return fHandled && fHandled->contains(c);
}
int32_t
UnhandledEngine::findBreaks( UText *text,
int32_t /* startPos */,
int32_t startPos,
int32_t endPos,
UVector32 &/*foundBreaks*/,
UBool /* isPhraseBreaking */,
UErrorCode &status) const {
if (U_FAILURE(status)) return 0;
UChar32 c = utext_current32(text);
utext_setNativeIndex(text, startPos);
UChar32 c = utext_current32(text);
while((int32_t)utext_getNativeIndex(text) < endPos && fHandled->contains(c)) {
utext_next32(text); // TODO: recast loop to work with post-increment operations.
c = utext_current32(text);
@ -120,41 +123,39 @@ ICULanguageBreakFactory::~ICULanguageBreakFactory() {
}
}
U_NAMESPACE_END
U_CDECL_BEGIN
static void U_CALLCONV _deleteEngine(void *obj) {
delete (const icu::LanguageBreakEngine *) obj;
void ICULanguageBreakFactory::ensureEngines(UErrorCode& status) {
static UMutex gBreakEngineMutex;
Mutex m(&gBreakEngineMutex);
if (fEngines == nullptr) {
LocalPointer<UStack> engines(new UStack(uprv_deleteUObject, nullptr, status), status);
if (U_SUCCESS(status)) {
fEngines = engines.orphan();
}
}
}
U_CDECL_END
U_NAMESPACE_BEGIN
const LanguageBreakEngine *
ICULanguageBreakFactory::getEngineFor(UChar32 c) {
ICULanguageBreakFactory::getEngineFor(UChar32 c, const char* locale) {
const LanguageBreakEngine *lbe = nullptr;
UErrorCode status = U_ZERO_ERROR;
ensureEngines(status);
if (U_FAILURE(status) ) {
// Note: no way to return error code to caller.
return nullptr;
}
static UMutex gBreakEngineMutex;
Mutex m(&gBreakEngineMutex);
if (fEngines == nullptr) {
LocalPointer<UStack> engines(new UStack(_deleteEngine, nullptr, status), status);
if (U_FAILURE(status) ) {
// Note: no way to return error code to caller.
return nullptr;
}
fEngines = engines.orphan();
} else {
int32_t i = fEngines->size();
while (--i >= 0) {
lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
if (lbe != nullptr && lbe->handles(c)) {
return lbe;
}
int32_t i = fEngines->size();
while (--i >= 0) {
lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
if (lbe != nullptr && lbe->handles(c, locale)) {
return lbe;
}
}
// We didn't find an engine. Create one.
lbe = loadEngineFor(c);
lbe = loadEngineFor(c, locale);
if (lbe != nullptr) {
fEngines->push((void *)lbe, status);
}
@ -162,7 +163,7 @@ ICULanguageBreakFactory::getEngineFor(UChar32 c) {
}
const LanguageBreakEngine *
ICULanguageBreakFactory::loadEngineFor(UChar32 c) {
ICULanguageBreakFactory::loadEngineFor(UChar32 c, const char*) {
UErrorCode status = U_ZERO_ERROR;
UScriptCode code = uscript_getScript(c, &status);
if (U_SUCCESS(status)) {
@ -299,6 +300,70 @@ ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) {
return nullptr;
}
void ICULanguageBreakFactory::addExternalEngine(
ExternalBreakEngine* external, UErrorCode& status) {
LocalPointer<ExternalBreakEngine> engine(external, status);
ensureEngines(status);
LocalPointer<BreakEngineWrapper> wrapper(
new BreakEngineWrapper(engine.orphan(), status), status);
static UMutex gBreakEngineMutex;
Mutex m(&gBreakEngineMutex);
fEngines->push(wrapper.getAlias(), status);
wrapper.orphan();
}
BreakEngineWrapper::BreakEngineWrapper(
ExternalBreakEngine* engine, UErrorCode &status) : delegate(engine, status) {
}
BreakEngineWrapper::~BreakEngineWrapper() {
}
UBool BreakEngineWrapper::handles(UChar32 c, const char* locale) const {
return delegate->isFor(c, locale);
}
int32_t BreakEngineWrapper::findBreaks(
UText *text,
int32_t startPos,
int32_t endPos,
UVector32 &foundBreaks,
UBool /* isPhraseBreaking */,
UErrorCode &status) const {
if (U_FAILURE(status)) return 0;
int32_t result = 0;
// Find the span of characters included in the set.
// The span to break begins at the current position in the text, and
// extends towards the start or end of the text, depending on 'reverse'.
utext_setNativeIndex(text, startPos);
int32_t start = (int32_t)utext_getNativeIndex(text);
int32_t current;
int32_t rangeStart;
int32_t rangeEnd;
UChar32 c = utext_current32(text);
while((current = (int32_t)utext_getNativeIndex(text)) < endPos && delegate->handles(c)) {
utext_next32(text); // TODO: recast loop for postincrement
c = utext_current32(text);
}
rangeStart = start;
rangeEnd = current;
int32_t beforeSize = foundBreaks.size();
int32_t additionalCapacity = rangeEnd - rangeStart + 1;
// enlarge to contains (rangeEnd-rangeStart+1) more items
foundBreaks.ensureCapacity(beforeSize+additionalCapacity, status);
if (U_FAILURE(status)) return 0;
foundBreaks.setSize(beforeSize + beforeSize+additionalCapacity);
result = delegate->fillBreaks(text, rangeStart, rangeEnd, foundBreaks.getBuffer()+beforeSize,
additionalCapacity, status);
if (U_FAILURE(status)) return 0;
foundBreaks.setSize(beforeSize + result);
utext_setNativeIndex(text, current);
return result;
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

View file

@ -10,6 +10,7 @@
#ifndef BRKENG_H
#define BRKENG_H
#include "unicode/umisc.h"
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "unicode/utext.h"
@ -21,6 +22,7 @@ class UnicodeSet;
class UStack;
class UVector32;
class DictionaryMatcher;
class ExternalBreakEngine;
/*******************************************************************
* LanguageBreakEngine
@ -35,7 +37,7 @@ class DictionaryMatcher;
* <p>LanguageBreakEngines should normally be implemented so as to
* be shared between threads without locking.</p>
*/
class LanguageBreakEngine : public UMemory {
class LanguageBreakEngine : public UObject {
public:
/**
@ -54,10 +56,11 @@ class LanguageBreakEngine : public UMemory {
* a particular kind of break.</p>
*
* @param c A character which begins a run that the engine might handle
* @param locale The locale.
* @return true if this engine handles the particular character and break
* type.
*/
virtual UBool handles(UChar32 c) const = 0;
virtual UBool handles(UChar32 c, const char* locale) const = 0;
/**
* <p>Find any breaks within a run in the supplied text.</p>
@ -80,6 +83,35 @@ class LanguageBreakEngine : public UMemory {
};
/*******************************************************************
* BreakEngineWrapper
*/
/**
* <p>BreakEngineWrapper implement LanguageBreakEngine by
* a thin wrapper that delegate the task to ExternalBreakEngine
* </p>
*/
class BreakEngineWrapper : public LanguageBreakEngine {
public:
BreakEngineWrapper(ExternalBreakEngine* engine, UErrorCode &status);
virtual ~BreakEngineWrapper();
virtual UBool handles(UChar32 c, const char* locale) const override;
virtual int32_t findBreaks( UText *text,
int32_t startPos,
int32_t endPos,
UVector32 &foundBreaks,
UBool isPhraseBreaking,
UErrorCode &status) const override;
private:
LocalPointer<ExternalBreakEngine> delegate;
};
/*******************************************************************
* LanguageBreakFactory
*/
@ -125,9 +157,10 @@ class LanguageBreakFactory : public UMemory {
*
* @param c A character that begins a run for which a LanguageBreakEngine is
* sought.
* @param locale The locale.
* @return A LanguageBreakEngine with the desired characteristics, or 0.
*/
virtual const LanguageBreakEngine *getEngineFor(UChar32 c) = 0;
virtual const LanguageBreakEngine *getEngineFor(UChar32 c, const char* locale) = 0;
};
@ -174,10 +207,11 @@ class UnhandledEngine : public LanguageBreakEngine {
* a particular kind of break.</p>
*
* @param c A character which begins a run that the engine might handle
* @param locale The locale.
* @return true if this engine handles the particular character and break
* type.
*/
virtual UBool handles(UChar32 c) const override;
virtual UBool handles(UChar32 c, const char* locale) const override;
/**
* <p>Find any breaks within a run in the supplied text.</p>
@ -247,9 +281,18 @@ class ICULanguageBreakFactory : public LanguageBreakFactory {
*
* @param c A character that begins a run for which a LanguageBreakEngine is
* sought.
* @param locale The locale.
* @return A LanguageBreakEngine with the desired characteristics, or 0.
*/
virtual const LanguageBreakEngine *getEngineFor(UChar32 c) override;
virtual const LanguageBreakEngine *getEngineFor(UChar32 c, const char* locale) override;
/**
* Add and adopt the engine and return an URegistryKey.
* @param engine The ExternalBreakEngine to be added and adopt. The caller
* pass the ownership and should not release the memory after this.
* @param status the error code.
*/
virtual void addExternalEngine(ExternalBreakEngine* engine, UErrorCode& status);
protected:
/**
@ -258,9 +301,10 @@ protected:
*
* @param c A character that begins a run for which a LanguageBreakEngine is
* sought.
* @param locale The locale.
* @return A LanguageBreakEngine with the desired characteristics, or 0.
*/
virtual const LanguageBreakEngine *loadEngineFor(UChar32 c);
virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, const char* locale);
/**
* <p>Create a DictionaryMatcher for the specified script and break type.</p>
@ -269,6 +313,9 @@ protected:
* @return A DictionaryMatcher with the desired characteristics, or nullptr.
*/
virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script);
private:
void ensureEngines(UErrorCode& status);
};
U_NAMESPACE_END

View file

@ -27,6 +27,7 @@
#include "unicode/rbbi.h"
#include "unicode/brkiter.h"
#include "unicode/udata.h"
#include "unicode/uloc.h"
#include "unicode/ures.h"
#include "unicode/ustring.h"
#include "unicode/filteredbrk.h"
@ -121,8 +122,11 @@ BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &st
// If there is a result, set the valid locale and actual locale, and the kind
if (U_SUCCESS(status) && result != nullptr) {
U_LOCALE_BASED(locBased, *(BreakIterator*)result);
locBased.setLocaleIDs(ures_getLocaleByType(b, ULOC_VALID_LOCALE, &status),
actualLocale.data());
uprv_strncpy(result->requestLocale, loc.getName(), ULOC_FULLNAME_CAPACITY);
result->requestLocale[ULOC_FULLNAME_CAPACITY-1] = 0; // always terminate
}
ures_close(b);
@ -202,18 +206,20 @@ BreakIterator::getAvailableLocales(int32_t& count)
BreakIterator::BreakIterator()
{
*validLocale = *actualLocale = 0;
*validLocale = *actualLocale = *requestLocale = 0;
}
BreakIterator::BreakIterator(const BreakIterator &other) : UObject(other) {
uprv_strncpy(actualLocale, other.actualLocale, sizeof(actualLocale));
uprv_strncpy(validLocale, other.validLocale, sizeof(validLocale));
uprv_strncpy(requestLocale, other.requestLocale, sizeof(requestLocale));
}
BreakIterator &BreakIterator::operator =(const BreakIterator &other) {
if (this != &other) {
uprv_strncpy(actualLocale, other.actualLocale, sizeof(actualLocale));
uprv_strncpy(validLocale, other.validLocale, sizeof(validLocale));
uprv_strncpy(requestLocale, other.requestLocale, sizeof(requestLocale));
}
return *this;
}
@ -493,12 +499,18 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
Locale
BreakIterator::getLocale(ULocDataLocaleType type, UErrorCode& status) const {
if (type == ULOC_REQUESTED_LOCALE) {
return Locale(requestLocale);
}
U_LOCALE_BASED(locBased, *this);
return locBased.getLocale(type, status);
}
const char *
BreakIterator::getLocaleID(ULocDataLocaleType type, UErrorCode& status) const {
if (type == ULOC_REQUESTED_LOCALE) {
return requestLocale;
}
U_LOCALE_BASED(locBased, *this);
return locBased.getLocaleID(type, status);
}

View file

@ -169,7 +169,7 @@ void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {
case UPROPS_SRC_INPC:
case UPROPS_SRC_INSC:
case UPROPS_SRC_VO:
uprops_addPropertyStarts((UPropertySource)src, &sa, &errorCode);
uprops_addPropertyStarts(src, &sa, &errorCode);
break;
case UPROPS_SRC_EMOJI: {
const icu::EmojiProps *ep = icu::EmojiProps::getSingleton(errorCode);
@ -178,6 +178,14 @@ void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {
}
break;
}
case UPROPS_SRC_IDSU:
// New in Unicode 15.1 for just two characters.
sa.add(sa.set, 0x2FFE);
sa.add(sa.set, 0x2FFF + 1);
break;
case UPROPS_SRC_ID_COMPAT_MATH:
uprops_addPropertyStarts(src, &sa, &errorCode);
break;
default:
errorCode = U_INTERNAL_PROGRAM_ERROR;
break;

View file

@ -42,7 +42,7 @@ DictionaryBreakEngine::~DictionaryBreakEngine() {
}
UBool
DictionaryBreakEngine::handles(UChar32 c) const {
DictionaryBreakEngine::handles(UChar32 c, const char*) const {
return fSet.contains(c);
}
@ -54,13 +54,13 @@ DictionaryBreakEngine::findBreaks( UText *text,
UBool isPhraseBreaking,
UErrorCode& status) const {
if (U_FAILURE(status)) return 0;
(void)startPos; // TODO: remove this param?
int32_t result = 0;
// Find the span of characters included in the set.
// The span to break begins at the current position in the text, and
// extends towards the start or end of the text, depending on 'reverse'.
utext_setNativeIndex(text, startPos);
int32_t start = (int32_t)utext_getNativeIndex(text);
int32_t current;
int32_t rangeStart;

View file

@ -62,10 +62,11 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
* a particular kind of break.</p>
*
* @param c A character which begins a run that the engine might handle
* @param locale The locale.
* @return true if this engine handles the particular character and break
* type.
*/
virtual UBool handles(UChar32 c) const override;
virtual UBool handles(UChar32 c, const char* locale) const override;
/**
* <p>Find any breaks within a run in the supplied text.</p>

View file

@ -143,6 +143,9 @@ static icu::UInitOnce nfkcInitOnce {};
static Norm2AllModes *nfkc_cfSingleton;
static icu::UInitOnce nfkc_cfInitOnce {};
static Norm2AllModes *nfkc_scfSingleton;
static icu::UInitOnce nfkc_scfInitOnce {};
static UHashtable *cache=nullptr;
// UInitOnce singleton initialization function
@ -156,6 +159,8 @@ static void U_CALLCONV initSingletons(const char *what, UErrorCode &errorCode) {
nfkcSingleton = Norm2AllModes::createInstance(nullptr, "nfkc", errorCode);
} else if (uprv_strcmp(what, "nfkc_cf") == 0) {
nfkc_cfSingleton = Norm2AllModes::createInstance(nullptr, "nfkc_cf", errorCode);
} else if (uprv_strcmp(what, "nfkc_scf") == 0) {
nfkc_scfSingleton = Norm2AllModes::createInstance(nullptr, "nfkc_scf", errorCode);
} else {
UPRV_UNREACHABLE_EXIT; // Unknown singleton
}
@ -183,6 +188,10 @@ static UBool U_CALLCONV uprv_loaded_normalizer2_cleanup() {
nfkc_cfSingleton = nullptr;
nfkc_cfInitOnce.reset();
delete nfkc_scfSingleton;
nfkc_scfSingleton = nullptr;
nfkc_scfInitOnce.reset();
uhash_close(cache);
cache=nullptr;
return true;
@ -213,6 +222,13 @@ Norm2AllModes::getNFKC_CFInstance(UErrorCode &errorCode) {
return nfkc_cfSingleton;
}
const Norm2AllModes *
Norm2AllModes::getNFKC_SCFInstance(UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) { return nullptr; }
umtx_initOnce(nfkc_scfInitOnce, &initSingletons, "nfkc_scf", errorCode);
return nfkc_scfSingleton;
}
#if !NORM2_HARDCODE_NFC_DATA
const Normalizer2 *
Normalizer2::getNFCInstance(UErrorCode &errorCode) {
@ -261,6 +277,12 @@ Normalizer2::getNFKCCasefoldInstance(UErrorCode &errorCode) {
return allModes!=nullptr ? &allModes->comp : nullptr;
}
const Normalizer2 *
Normalizer2::getNFKCSimpleCasefoldInstance(UErrorCode &errorCode) {
const Norm2AllModes *allModes=Norm2AllModes::getNFKC_SCFInstance(errorCode);
return allModes!=nullptr ? &allModes->comp : nullptr;
}
const Normalizer2 *
Normalizer2::getInstance(const char *packageName,
const char *name,
@ -281,6 +303,8 @@ Normalizer2::getInstance(const char *packageName,
allModes=Norm2AllModes::getNFKCInstance(errorCode);
} else if(0==uprv_strcmp(name, "nfkc_cf")) {
allModes=Norm2AllModes::getNFKC_CFInstance(errorCode);
} else if(0==uprv_strcmp(name, "nfkc_scf")) {
allModes=Norm2AllModes::getNFKC_SCFInstance(errorCode);
}
}
if(allModes==nullptr && U_SUCCESS(errorCode)) {
@ -393,6 +417,11 @@ unorm2_getNFKCCasefoldInstance(UErrorCode *pErrorCode) {
return (const UNormalizer2 *)Normalizer2::getNFKCCasefoldInstance(*pErrorCode);
}
U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getNFKCSimpleCasefoldInstance(UErrorCode *pErrorCode) {
return (const UNormalizer2 *)Normalizer2::getNFKCSimpleCasefoldInstance(*pErrorCode);
}
U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getInstance(const char *packageName,
const char *name,

File diff suppressed because it is too large Load diff

View file

@ -307,7 +307,7 @@ LSR getMaximalLsrOrUnd(const XLikelySubtags &likelySubtags, const Locale &locale
if (U_FAILURE(errorCode) || locale.isBogus() || *locale.getName() == 0 /* "und" */) {
return UND_LSR;
} else {
return likelySubtags.makeMaximizedLsrFrom(locale, errorCode);
return likelySubtags.makeMaximizedLsrFrom(locale, false, errorCode);
}
}

View file

@ -563,7 +563,7 @@ private:
LocalMemory<int32_t>& replacementIndexes,
int32_t &length,
void (*checkType)(const char* type),
void (*checkReplacement)(const UnicodeString& replacement),
void (*checkReplacement)(const UChar* replacement),
UErrorCode &status);
// Read the languageAlias data from alias to
@ -700,7 +700,7 @@ AliasDataBuilder::readAlias(
LocalMemory<int32_t>& replacementIndexes,
int32_t &length,
void (*checkType)(const char* type),
void (*checkReplacement)(const UnicodeString& replacement),
void (*checkReplacement)(const UChar* replacement),
UErrorCode &status) {
if (U_FAILURE(status)) {
return;
@ -720,8 +720,8 @@ AliasDataBuilder::readAlias(
LocalUResourceBundlePointer res(
ures_getNextResource(alias, nullptr, &status));
const char* aliasFrom = ures_getKey(res.getAlias());
UnicodeString aliasTo =
ures_getUnicodeStringByKey(res.getAlias(), "replacement", &status);
const UChar* aliasTo =
ures_getStringByKey(res.getAlias(), "replacement", nullptr, &status);
if (U_FAILURE(status)) return;
checkType(aliasFrom);
@ -766,7 +766,7 @@ AliasDataBuilder::readLanguageAlias(
#else
[](const char*) {},
#endif
[](const UnicodeString&) {}, status);
[](const UChar*) {}, status);
}
/**
@ -790,12 +790,12 @@ AliasDataBuilder::readScriptAlias(
[](const char* type) {
U_ASSERT(uprv_strlen(type) == 4);
},
[](const UnicodeString& replacement) {
U_ASSERT(replacement.length() == 4);
[](const UChar* replacement) {
U_ASSERT(u_strlen(replacement) == 4);
},
#else
[](const char*) {},
[](const UnicodeString&) { },
[](const UChar*) { },
#endif
status);
}
@ -824,7 +824,7 @@ AliasDataBuilder::readTerritoryAlias(
#else
[](const char*) {},
#endif
[](const UnicodeString&) { },
[](const UChar*) { },
status);
}
@ -851,15 +851,16 @@ AliasDataBuilder::readVariantAlias(
U_ASSERT(uprv_strlen(type) != 4 ||
(type[0] >= '0' && type[0] <= '9'));
},
[](const UnicodeString& replacement) {
U_ASSERT(replacement.length() >= 4 && replacement.length() <= 8);
U_ASSERT(replacement.length() != 4 ||
(replacement.charAt(0) >= u'0' &&
replacement.charAt(0) <= u'9'));
[](const UChar* replacement) {
int32_t len = u_strlen(replacement);
U_ASSERT(len >= 4 && len <= 8);
U_ASSERT(len != 4 ||
(*replacement >= u'0' &&
*replacement <= u'9'));
},
#else
[](const char*) {},
[](const UnicodeString&) { },
[](const UChar*) { },
#endif
status);
}
@ -888,7 +889,7 @@ AliasDataBuilder::readSubdivisionAlias(
#else
[](const char*) {},
#endif
[](const UnicodeString&) { },
[](const UChar*) { },
status);
}
@ -1066,7 +1067,13 @@ class AliasReplacer {
public:
AliasReplacer(UErrorCode status) :
language(nullptr), script(nullptr), region(nullptr),
extensions(nullptr), variants(status),
extensions(nullptr),
// store value in variants only once
variants(nullptr,
([](UElement e1, UElement e2) -> UBool {
return 0==uprv_strcmp((const char*)e1.pointer,
(const char*)e2.pointer);}),
status),
data(nullptr) {
}
~AliasReplacer() {
@ -1652,10 +1659,16 @@ AliasReplacer::replace(const Locale& locale, CharString& out, UErrorCode& status
while ((end = uprv_strchr(start, SEP_CHAR)) != nullptr &&
U_SUCCESS(status)) {
*end = NULL_CHAR; // null terminate inside variantsBuff
variants.addElement(start, status);
// do not add "" or duplicate data to variants
if (*start && !variants.contains(start)) {
variants.addElement(start, status);
}
start = end + 1;
}
variants.addElement(start, status);
// do not add "" or duplicate data to variants
if (*start && !variants.contains(start)) {
variants.addElement(start, status);
}
}
if (U_FAILURE(status)) { return false; }
@ -2079,6 +2092,10 @@ Locale::addLikelySubtags(UErrorCode& status) {
void
Locale::minimizeSubtags(UErrorCode& status) {
Locale::minimizeSubtags(false, status);
}
void
Locale::minimizeSubtags(bool favorScript, UErrorCode& status) {
if (U_FAILURE(status)) {
return;
}
@ -2086,7 +2103,7 @@ Locale::minimizeSubtags(UErrorCode& status) {
CharString minimizedLocaleID;
{
CharStringByteSink sink(&minimizedLocaleID);
ulocimp_minimizeSubtags(fullName, sink, &status);
ulocimp_minimizeSubtags(fullName, sink, favorScript, &status);
}
if (U_FAILURE(status)) {
@ -2402,8 +2419,9 @@ Locale::getLocaleCache()
}
class KeywordEnumeration : public StringEnumeration {
private:
protected:
char *keywords;
private:
char *current;
int32_t length;
UnicodeString currUSKey;
@ -2510,6 +2528,17 @@ public:
if (resultLength != nullptr) *resultLength = 0;
return nullptr;
}
virtual int32_t count(UErrorCode &/*status*/) const override {
char *kw = keywords;
int32_t result = 0;
while(*kw) {
if (uloc_toUnicodeLocaleKey(kw) != nullptr) {
result++;
}
kw += uprv_strlen(kw)+1;
}
return result;
}
};
// Out-of-line virtual destructor to serve as the "key function".

View file

@ -31,82 +31,10 @@
#include "charstr.h"
#include "cmemory.h"
#include "cstring.h"
#include "loclikelysubtags.h"
#include "ulocimp.h"
#include "ustr_imp.h"
/**
* These are the canonical strings for unknown languages, scripts and regions.
**/
static const char* const unknownLanguage = "und";
static const char* const unknownScript = "Zzzz";
static const char* const unknownRegion = "ZZ";
/**
* This function looks for the localeID in the likelySubtags resource.
*
* @param localeID The tag to find.
* @param buffer A buffer to hold the matching entry
* @param bufferLength The length of the output buffer
* @return A pointer to "buffer" if found, or a null pointer if not.
*/
static const char* U_CALLCONV
findLikelySubtags(const char* localeID,
char* buffer,
int32_t bufferLength,
UErrorCode* err) {
const char* result = nullptr;
if (!U_FAILURE(*err)) {
int32_t resLen = 0;
const char16_t* s = nullptr;
UErrorCode tmpErr = U_ZERO_ERROR;
icu::LocalUResourceBundlePointer subtags(ures_openDirect(nullptr, "likelySubtags", &tmpErr));
if (U_SUCCESS(tmpErr)) {
icu::CharString und;
if (localeID != nullptr) {
if (*localeID == '\0') {
localeID = unknownLanguage;
} else if (*localeID == '_') {
und.append(unknownLanguage, *err);
und.append(localeID, *err);
if (U_FAILURE(*err)) {
return nullptr;
}
localeID = und.data();
}
}
s = ures_getStringByKey(subtags.getAlias(), localeID, &resLen, &tmpErr);
if (U_FAILURE(tmpErr)) {
/*
* If a resource is missing, it's not really an error, it's
* just that we don't have any data for that particular locale ID.
*/
if (tmpErr != U_MISSING_RESOURCE_ERROR) {
*err = tmpErr;
}
}
else if (resLen >= bufferLength) {
/* The buffer should never overflow. */
*err = U_INTERNAL_PROGRAM_ERROR;
}
else {
u_UCharsToChars(s, buffer, resLen + 1);
if (resLen >= 3 &&
uprv_strnicmp(buffer, unknownLanguage, 3) == 0 &&
(resLen == 3 || buffer[3] == '_')) {
uprv_memmove(buffer, buffer + 3, resLen - 3 + 1);
}
result = buffer;
}
} else {
*err = tmpErr;
}
}
return result;
}
/**
* Append a tag to a buffer, adding the separator if necessary. The buffer
* must be large enough to contain the resulting tag plus any separator
@ -360,57 +288,6 @@ error:
}
}
/**
* Create a tag string from the supplied parameters. The lang, script and region
* parameters may be nullptr pointers. If they are, their corresponding length parameters
* must be less than or equal to 0. If the lang parameter is an empty string, the
* default value for an unknown language is written to the output buffer.
*
* If the length of the new string exceeds the capacity of the output buffer,
* the function copies as many bytes to the output buffer as it can, and returns
* the error U_BUFFER_OVERFLOW_ERROR.
*
* If an illegal argument is provided, the function returns the error
* U_ILLEGAL_ARGUMENT_ERROR.
*
* @param lang The language tag to use.
* @param langLength The length of the language tag.
* @param script The script tag to use.
* @param scriptLength The length of the script tag.
* @param region The region tag to use.
* @param regionLength The length of the region tag.
* @param trailing Any trailing data to append to the new tag.
* @param trailingLength The length of the trailing data.
* @param sink The output sink receiving the tag string.
* @param err A pointer to a UErrorCode for error reporting.
**/
static void U_CALLCONV
createTagString(
const char* lang,
int32_t langLength,
const char* script,
int32_t scriptLength,
const char* region,
int32_t regionLength,
const char* trailing,
int32_t trailingLength,
icu::ByteSink& sink,
UErrorCode* err)
{
createTagStringWithAlternates(
lang,
langLength,
script,
scriptLength,
region,
regionLength,
trailing,
trailingLength,
nullptr,
sink,
err);
}
/**
* Parse the language, script, and region subtags from a tag string, and copy the
* results into the corresponding output parameters. The buffers are null-terminated,
@ -494,13 +371,6 @@ parseTagString(
*scriptLength = subtagLength;
if (*scriptLength > 0) {
if (uprv_strnicmp(script, unknownScript, *scriptLength) == 0) {
/**
* If the script part is the "unknown" script, then don't return it.
**/
*scriptLength = 0;
}
/*
* Move past any separator.
*/
@ -517,14 +387,7 @@ parseTagString(
*regionLength = subtagLength;
if (*regionLength > 0) {
if (uprv_strnicmp(region, unknownRegion, *regionLength) == 0) {
/**
* If the region part is the "unknown" region, then don't return it.
**/
*regionLength = 0;
}
} else if (*position != 0 && *position != '@') {
if (*regionLength <= 0 && *position != 0 && *position != '@') {
/* back up over consumed trailing separator */
--position;
}
@ -546,264 +409,6 @@ error:
goto exit;
}
static UBool U_CALLCONV
createLikelySubtagsString(
const char* lang,
int32_t langLength,
const char* script,
int32_t scriptLength,
const char* region,
int32_t regionLength,
const char* variants,
int32_t variantsLength,
icu::ByteSink& sink,
UErrorCode* err) {
/**
* ULOC_FULLNAME_CAPACITY will provide enough capacity
* that we can build a string that contains the language,
* script and region code without worrying about overrunning
* the user-supplied buffer.
**/
char likelySubtagsBuffer[ULOC_FULLNAME_CAPACITY];
if(U_FAILURE(*err)) {
goto error;
}
/**
* Try the language with the script and region first.
**/
if (scriptLength > 0 && regionLength > 0) {
const char* likelySubtags = nullptr;
icu::CharString tagBuffer;
{
icu::CharStringByteSink sink(&tagBuffer);
createTagString(
lang,
langLength,
script,
scriptLength,
region,
regionLength,
nullptr,
0,
sink,
err);
}
if(U_FAILURE(*err)) {
goto error;
}
likelySubtags =
findLikelySubtags(
tagBuffer.data(),
likelySubtagsBuffer,
sizeof(likelySubtagsBuffer),
err);
if(U_FAILURE(*err)) {
goto error;
}
if (likelySubtags != nullptr) {
/* Always use the language tag from the
maximal string, since it may be more
specific than the one provided. */
createTagStringWithAlternates(
nullptr,
0,
nullptr,
0,
nullptr,
0,
variants,
variantsLength,
likelySubtags,
sink,
err);
return true;
}
}
/**
* Try the language with just the script.
**/
if (scriptLength > 0) {
const char* likelySubtags = nullptr;
icu::CharString tagBuffer;
{
icu::CharStringByteSink sink(&tagBuffer);
createTagString(
lang,
langLength,
script,
scriptLength,
nullptr,
0,
nullptr,
0,
sink,
err);
}
if(U_FAILURE(*err)) {
goto error;
}
likelySubtags =
findLikelySubtags(
tagBuffer.data(),
likelySubtagsBuffer,
sizeof(likelySubtagsBuffer),
err);
if(U_FAILURE(*err)) {
goto error;
}
if (likelySubtags != nullptr) {
/* Always use the language tag from the
maximal string, since it may be more
specific than the one provided. */
createTagStringWithAlternates(
nullptr,
0,
nullptr,
0,
region,
regionLength,
variants,
variantsLength,
likelySubtags,
sink,
err);
return true;
}
}
/**
* Try the language with just the region.
**/
if (regionLength > 0) {
const char* likelySubtags = nullptr;
icu::CharString tagBuffer;
{
icu::CharStringByteSink sink(&tagBuffer);
createTagString(
lang,
langLength,
nullptr,
0,
region,
regionLength,
nullptr,
0,
sink,
err);
}
if(U_FAILURE(*err)) {
goto error;
}
likelySubtags =
findLikelySubtags(
tagBuffer.data(),
likelySubtagsBuffer,
sizeof(likelySubtagsBuffer),
err);
if(U_FAILURE(*err)) {
goto error;
}
if (likelySubtags != nullptr) {
/* Always use the language tag from the
maximal string, since it may be more
specific than the one provided. */
createTagStringWithAlternates(
nullptr,
0,
script,
scriptLength,
nullptr,
0,
variants,
variantsLength,
likelySubtags,
sink,
err);
return true;
}
}
/**
* Finally, try just the language.
**/
{
const char* likelySubtags = nullptr;
icu::CharString tagBuffer;
{
icu::CharStringByteSink sink(&tagBuffer);
createTagString(
lang,
langLength,
nullptr,
0,
nullptr,
0,
nullptr,
0,
sink,
err);
}
if(U_FAILURE(*err)) {
goto error;
}
likelySubtags =
findLikelySubtags(
tagBuffer.data(),
likelySubtagsBuffer,
sizeof(likelySubtagsBuffer),
err);
if(U_FAILURE(*err)) {
goto error;
}
if (likelySubtags != nullptr) {
/* Always use the language tag from the
maximal string, since it may be more
specific than the one provided. */
createTagStringWithAlternates(
nullptr,
0,
script,
scriptLength,
region,
regionLength,
variants,
variantsLength,
likelySubtags,
sink,
err);
return true;
}
}
return false;
error:
if (!U_FAILURE(*err)) {
*err = U_ILLEGAL_ARGUMENT_ERROR;
}
return false;
}
#define CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength) UPRV_BLOCK_MACRO_BEGIN { \
int32_t count = 0; \
int32_t i; \
@ -836,7 +441,6 @@ _uloc_addLikelySubtags(const char* localeID,
const char* trailing = "";
int32_t trailingLength = 0;
int32_t trailingIndex = 0;
UBool success = false;
if(U_FAILURE(*err)) {
goto error;
@ -862,6 +466,9 @@ _uloc_addLikelySubtags(const char* localeID,
goto error;
}
if (langLength > 3) {
goto error;
}
/* Find the length of the trailing portion. */
while (_isIDSeparator(localeID[trailingIndex])) {
@ -871,30 +478,42 @@ _uloc_addLikelySubtags(const char* localeID,
trailingLength = (int32_t)uprv_strlen(trailing);
CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength);
success =
createLikelySubtagsString(
lang,
langLength,
script,
scriptLength,
region,
regionLength,
{
const icu::XLikelySubtags* likelySubtags = icu::XLikelySubtags::getSingleton(*err);
if(U_FAILURE(*err)) {
goto error;
}
// We need to keep l on the stack because lsr may point into internal
// memory of l.
icu::Locale l = icu::Locale::createFromName(localeID);
if (l.isBogus()) {
goto error;
}
icu::LSR lsr = likelySubtags->makeMaximizedLsrFrom(l, true, *err);
if(U_FAILURE(*err)) {
goto error;
}
const char* language = lsr.language;
if (uprv_strcmp(language, "und") == 0) {
language = "";
}
createTagStringWithAlternates(
language,
(int32_t)uprv_strlen(language),
lsr.script,
(int32_t)uprv_strlen(lsr.script),
lsr.region,
(int32_t)uprv_strlen(lsr.region),
trailing,
trailingLength,
nullptr,
sink,
err);
if (!success) {
const int32_t localIDLength = (int32_t)uprv_strlen(localeID);
/*
* If we get here, we need to return localeID.
*/
sink.Append(localeID, localIDLength);
if(U_FAILURE(*err)) {
goto error;
}
}
return success;
return true;
error:
@ -913,6 +532,7 @@ static UBool _ulocimp_addLikelySubtags(const char*, icu::ByteSink&, UErrorCode*)
static void
_uloc_minimizeSubtags(const char* localeID,
icu::ByteSink& sink,
bool favorScript,
UErrorCode* err) {
icu::CharString maximizedTagBuffer;
@ -925,7 +545,6 @@ _uloc_minimizeSubtags(const char* localeID,
const char* trailing = "";
int32_t trailingLength = 0;
int32_t trailingIndex = 0;
UBool successGetMax = false;
if(U_FAILURE(*err)) {
goto error;
@ -964,213 +583,38 @@ _uloc_minimizeSubtags(const char* localeID,
CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength);
{
icu::CharString base;
{
icu::CharStringByteSink baseSink(&base);
createTagString(
lang,
langLength,
script,
scriptLength,
region,
regionLength,
nullptr,
0,
baseSink,
err);
}
/**
* First, we need to first get the maximization
* from AddLikelySubtags.
**/
{
icu::CharStringByteSink maxSink(&maximizedTagBuffer);
successGetMax = _ulocimp_addLikelySubtags(base.data(), maxSink, err);
}
}
if(U_FAILURE(*err)) {
goto error;
}
if (!successGetMax) {
/**
* If we got here, return the locale ID parameter unchanged.
**/
const int32_t localeIDLength = (int32_t)uprv_strlen(localeID);
sink.Append(localeID, localeIDLength);
return;
}
// In the following, the lang, script, region are referring to those in
// the maximizedTagBuffer, not the one in the localeID.
langLength = sizeof(lang);
scriptLength = sizeof(script);
regionLength = sizeof(region);
parseTagString(
maximizedTagBuffer.data(),
lang,
&langLength,
script,
&scriptLength,
region,
&regionLength,
err);
if(U_FAILURE(*err)) {
goto error;
}
/**
* Start first with just the language.
**/
{
icu::CharString tagBuffer;
{
icu::CharStringByteSink tagSink(&tagBuffer);
createLikelySubtagsString(
lang,
langLength,
nullptr,
0,
nullptr,
0,
nullptr,
0,
tagSink,
err);
}
const icu::XLikelySubtags* likelySubtags = icu::XLikelySubtags::getSingleton(*err);
if(U_FAILURE(*err)) {
goto error;
}
else if (!tagBuffer.isEmpty() &&
uprv_strnicmp(
maximizedTagBuffer.data(),
tagBuffer.data(),
tagBuffer.length()) == 0) {
createTagString(
lang,
langLength,
nullptr,
0,
nullptr,
0,
trailing,
trailingLength,
sink,
err);
return;
}
}
/**
* Next, try the language and region.
**/
if (regionLength > 0) {
icu::CharString tagBuffer;
{
icu::CharStringByteSink tagSink(&tagBuffer);
createLikelySubtagsString(
lang,
langLength,
nullptr,
0,
region,
regionLength,
nullptr,
0,
tagSink,
err);
}
icu::LSR lsr = likelySubtags->minimizeSubtags(
{lang, langLength},
{script, scriptLength},
{region, regionLength},
favorScript,
*err);
if(U_FAILURE(*err)) {
goto error;
}
else if (!tagBuffer.isEmpty() &&
uprv_strnicmp(
maximizedTagBuffer.data(),
tagBuffer.data(),
tagBuffer.length()) == 0) {
createTagString(
lang,
langLength,
nullptr,
0,
region,
regionLength,
trailing,
trailingLength,
sink,
err);
return;
const char* language = lsr.language;
if (uprv_strcmp(language, "und") == 0) {
language = "";
}
}
/**
* Finally, try the language and script. This is our last chance,
* since trying with all three subtags would only yield the
* maximal version that we already have.
**/
if (scriptLength > 0) {
icu::CharString tagBuffer;
{
icu::CharStringByteSink tagSink(&tagBuffer);
createLikelySubtagsString(
lang,
langLength,
script,
scriptLength,
nullptr,
0,
nullptr,
0,
tagSink,
err);
}
createTagStringWithAlternates(
language,
(int32_t)uprv_strlen(language),
lsr.script,
(int32_t)uprv_strlen(lsr.script),
lsr.region,
(int32_t)uprv_strlen(lsr.region),
trailing,
trailingLength,
nullptr,
sink,
err);
if(U_FAILURE(*err)) {
goto error;
}
else if (!tagBuffer.isEmpty() &&
uprv_strnicmp(
maximizedTagBuffer.data(),
tagBuffer.data(),
tagBuffer.length()) == 0) {
createTagString(
lang,
langLength,
script,
scriptLength,
nullptr,
0,
trailing,
trailingLength,
sink,
err);
return;
}
}
{
/**
* If we got here, return the max + trail.
**/
createTagString(
lang,
langLength,
script,
scriptLength,
region,
regionLength,
trailing,
trailingLength,
sink,
err);
return;
}
@ -1181,31 +625,6 @@ error:
}
}
static int32_t
do_canonicalize(const char* localeID,
char* buffer,
int32_t bufferCapacity,
UErrorCode* err)
{
int32_t canonicalizedSize = uloc_canonicalize(
localeID,
buffer,
bufferCapacity,
err);
if (*err == U_STRING_NOT_TERMINATED_WARNING ||
*err == U_BUFFER_OVERFLOW_ERROR) {
return canonicalizedSize;
}
else if (U_FAILURE(*err)) {
return -1;
}
else {
return canonicalizedSize;
}
}
U_CAPI int32_t U_EXPORT2
uloc_addLikelySubtags(const char* localeID,
char* maximizedLocaleID,
@ -1239,14 +658,13 @@ static UBool
_ulocimp_addLikelySubtags(const char* localeID,
icu::ByteSink& sink,
UErrorCode* status) {
PreflightingLocaleIDBuffer localeBuffer;
do {
localeBuffer.requestedCapacity = do_canonicalize(localeID, localeBuffer.getBuffer(),
localeBuffer.getCapacity(), status);
} while (localeBuffer.needToTryAgain(status));
icu::CharString localeBuffer;
{
icu::CharStringByteSink localeSink(&localeBuffer);
ulocimp_canonicalize(localeID, localeSink, status);
}
if (U_SUCCESS(*status)) {
return _uloc_addLikelySubtags(localeBuffer.getBuffer(), sink, status);
return _uloc_addLikelySubtags(localeBuffer.data(), sink, status);
} else {
return false;
}
@ -1271,7 +689,7 @@ uloc_minimizeSubtags(const char* localeID,
icu::CheckedArrayByteSink sink(
minimizedLocaleID, minimizedLocaleIDCapacity);
ulocimp_minimizeSubtags(localeID, sink, status);
ulocimp_minimizeSubtags(localeID, sink, false, status);
int32_t reslen = sink.NumberOfBytesAppended();
if (U_FAILURE(*status)) {
@ -1291,14 +709,14 @@ uloc_minimizeSubtags(const char* localeID,
U_CAPI void U_EXPORT2
ulocimp_minimizeSubtags(const char* localeID,
icu::ByteSink& sink,
bool favorScript,
UErrorCode* status) {
PreflightingLocaleIDBuffer localeBuffer;
do {
localeBuffer.requestedCapacity = do_canonicalize(localeID, localeBuffer.getBuffer(),
localeBuffer.getCapacity(), status);
} while (localeBuffer.needToTryAgain(status));
_uloc_minimizeSubtags(localeBuffer.getBuffer(), sink, status);
icu::CharString localeBuffer;
{
icu::CharStringByteSink localeSink(&localeBuffer);
ulocimp_canonicalize(localeID, localeSink, status);
}
_uloc_minimizeSubtags(localeBuffer.data(), sink, favorScript, status);
}
// Pairs of (language subtag, + or -) for finding out fast if common languages
@ -1374,16 +792,26 @@ ulocimp_getRegionForSupplementalData(const char *localeID, UBool inferRegion,
UErrorCode rgStatus = U_ZERO_ERROR;
// First check for rg keyword value
int32_t rgLen = uloc_getKeywordValue(localeID, "rg", rgBuf, ULOC_RG_BUFLEN, &rgStatus);
if (U_FAILURE(rgStatus) || rgLen != 6) {
icu::CharString rg;
{
icu::CharStringByteSink sink(&rg);
ulocimp_getKeywordValue(localeID, "rg", sink, &rgStatus);
}
int32_t rgLen = rg.length();
if (U_FAILURE(rgStatus) || rgLen < 3 || rgLen > 7) {
rgLen = 0;
} else {
// rgBuf guaranteed to be zero terminated here, with text len 6
char *rgPtr = rgBuf;
for (; *rgPtr!= 0; rgPtr++) {
*rgPtr = uprv_toupper(*rgPtr);
// chop off the subdivision code (which will generally be "zzzz" anyway)
const char* const data = rg.data();
if (uprv_isASCIILetter(data[0])) {
rgLen = 2;
rgBuf[0] = uprv_toupper(data[0]);
rgBuf[1] = uprv_toupper(data[1]);
} else {
// assume three-digit region code
rgLen = 3;
uprv_memcpy(rgBuf, data, rgLen);
}
rgLen = (uprv_strcmp(rgBuf+2, "ZZZZ") == 0)? 2: 0;
}
if (rgLen == 0) {

View file

@ -11,6 +11,7 @@
#include "unicode/locid.h"
#include "unicode/uobject.h"
#include "unicode/ures.h"
#include "unicode/uscript.h"
#include "charstr.h"
#include "cstring.h"
#include "loclikelysubtags.h"
@ -23,6 +24,7 @@
#include "uniquecharstr.h"
#include "uresdata.h"
#include "uresimp.h"
#include "uvector.h"
U_NAMESPACE_BEGIN
@ -81,11 +83,18 @@ struct XLikelySubtagsData {
// Read all strings in the resource bundle and convert them to invariant char *.
LocalMemory<int32_t> languageIndexes, regionIndexes, lsrSubtagIndexes;
int32_t languagesLength = 0, regionsLength = 0, lsrSubtagsLength = 0;
ResourceArray m49Array;
if (likelyTable.findValue("m49", value)) {
m49Array = value.getArray(errorCode);
} else {
errorCode = U_MISSING_RESOURCE_ERROR;
return;
}
if (!readStrings(likelyTable, "languageAliases", value,
languageIndexes, languagesLength, errorCode) ||
!readStrings(likelyTable, "regionAliases", value,
regionIndexes, regionsLength, errorCode) ||
!readStrings(likelyTable, "lsrs", value,
!readLSREncodedStrings(likelyTable, "lsrnum", value, m49Array,
lsrSubtagIndexes,lsrSubtagsLength, errorCode)) {
return;
}
@ -136,7 +145,7 @@ struct XLikelySubtagsData {
if (!readStrings(matchTable, "partitions", value,
partitionIndexes, partitionsLength, errorCode) ||
!readStrings(matchTable, "paradigms", value,
!readLSREncodedStrings(matchTable, "paradigmnum", value, m49Array,
paradigmSubtagIndexes, paradigmSubtagsLength, errorCode)) {
return;
}
@ -233,10 +242,96 @@ private:
return false;
}
for (int i = 0; i < length; ++i) {
stringArray.getValue(i, value); // returns true because i < length
rawIndexes[i] = strings.add(value.getUnicodeString(errorCode), errorCode);
if (stringArray.getValue(i, value)) { // returns true because i < length
int32_t strLength = 0;
rawIndexes[i] = strings.add(value.getString(strLength, errorCode), errorCode);
if (U_FAILURE(errorCode)) { return false; }
}
}
}
return true;
}
UnicodeString toLanguage(int encoded) {
if (encoded == 0) {
return UNICODE_STRING_SIMPLE("");
}
if (encoded == 1) {
return UNICODE_STRING_SIMPLE("skip");
}
encoded &= 0x00ffffff;
encoded %= 27*27*27;
char lang[3];
lang[0] = 'a' + ((encoded % 27) - 1);
lang[1] = 'a' + (((encoded / 27 ) % 27) - 1);
if (encoded / (27 * 27) == 0) {
return UnicodeString(lang, 2, US_INV);
}
lang[2] = 'a' + ((encoded / (27 * 27)) - 1);
return UnicodeString(lang, 3, US_INV);
}
UnicodeString toScript(int encoded) {
if (encoded == 0) {
return UNICODE_STRING_SIMPLE("");
}
if (encoded == 1) {
return UNICODE_STRING_SIMPLE("script");
}
encoded = (encoded >> 24) & 0x000000ff;
const char* script = uscript_getShortName(static_cast<UScriptCode>(encoded));
if (script == nullptr) {
return UNICODE_STRING_SIMPLE("");
}
U_ASSERT(uprv_strlen(script) == 4);
return UnicodeString(script, 4, US_INV);
}
UnicodeString m49IndexToCode(const ResourceArray &m49Array, ResourceValue &value, int index, UErrorCode &errorCode) {
if (U_FAILURE(errorCode)) {
return UNICODE_STRING_SIMPLE("");
}
if (m49Array.getValue(index, value)) {
return value.getUnicodeString(errorCode);
}
// "m49" does not include the index.
errorCode = U_MISSING_RESOURCE_ERROR;
return UNICODE_STRING_SIMPLE("");
}
UnicodeString toRegion(const ResourceArray& m49Array, ResourceValue &value, int encoded, UErrorCode &errorCode) {
if (encoded == 0 || encoded == 1) {
return UNICODE_STRING_SIMPLE("");
}
encoded &= 0x00ffffff;
encoded /= 27 * 27 * 27;
encoded %= 27 * 27;
if (encoded < 27) {
// Selected M49 code index, find the code from "m49" resource.
return m49IndexToCode(m49Array, value, encoded, errorCode);
}
char region[2];
region[0] = 'A' + ((encoded % 27) - 1);
region[1] = 'A' + (((encoded / 27) % 27) - 1);
return UnicodeString(region, 2, US_INV);
}
bool readLSREncodedStrings(const ResourceTable &table, const char* key, ResourceValue &value, const ResourceArray& m49Array,
LocalMemory<int32_t> &indexes, int32_t &length, UErrorCode &errorCode) {
if (table.findValue(key, value)) {
const int32_t* vectors = value.getIntVector(length, errorCode);
if (U_FAILURE(errorCode)) { return false; }
if (length == 0) { return true; }
int32_t *rawIndexes = indexes.allocateInsteadAndCopy(length * 3);
if (rawIndexes == nullptr) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return false;
}
for (int i = 0; i < length; ++i) {
rawIndexes[i*3] = strings.addByValue(toLanguage(vectors[i]), errorCode);
rawIndexes[i*3+1] = strings.addByValue(toScript(vectors[i]), errorCode);
rawIndexes[i*3+2] = strings.addByValue(
toRegion(m49Array, value, vectors[i], errorCode), errorCode);
if (U_FAILURE(errorCode)) { return false; }
}
length *= 3;
}
return true;
}
@ -245,15 +340,52 @@ private:
namespace {
XLikelySubtags *gLikelySubtags = nullptr;
UVector *gMacroregions = nullptr;
UInitOnce gInitOnce {};
UBool U_CALLCONV cleanup() {
delete gLikelySubtags;
gLikelySubtags = nullptr;
delete gMacroregions;
gMacroregions = nullptr;
gInitOnce.reset();
return true;
}
static const char16_t RANGE_MARKER = 0x7E; /* '~' */
UVector* loadMacroregions(UErrorCode &status) {
LocalPointer<UVector> newMacroRegions(new UVector(uprv_deleteUObject, uhash_compareUnicodeString, status), status);
LocalUResourceBundlePointer supplementalData(ures_openDirect(nullptr,"supplementalData",&status));
LocalUResourceBundlePointer idValidity(ures_getByKey(supplementalData.getAlias(),"idValidity",nullptr,&status));
LocalUResourceBundlePointer regionList(ures_getByKey(idValidity.getAlias(),"region",nullptr,&status));
LocalUResourceBundlePointer regionMacro(ures_getByKey(regionList.getAlias(),"macroregion",nullptr,&status));
if (U_FAILURE(status)) {
return nullptr;
}
while (U_SUCCESS(status) && ures_hasNext(regionMacro.getAlias())) {
UnicodeString regionName = ures_getNextUnicodeString(regionMacro.getAlias(),nullptr,&status);
int32_t rangeMarkerLocation = regionName.indexOf(RANGE_MARKER);
char16_t buf[6];
regionName.extract(buf,6,status);
if ( rangeMarkerLocation > 0 ) {
char16_t endRange = regionName.charAt(rangeMarkerLocation+1);
buf[rangeMarkerLocation] = 0;
while ( buf[rangeMarkerLocation-1] <= endRange && U_SUCCESS(status)) {
LocalPointer<UnicodeString> newRegion(new UnicodeString(buf), status);
newMacroRegions->adoptElement(newRegion.orphan(),status);
buf[rangeMarkerLocation-1]++;
}
} else {
LocalPointer<UnicodeString> newRegion(new UnicodeString(regionName), status);
newMacroRegions->adoptElement(newRegion.orphan(),status);
}
}
return newMacroRegions.orphan();
}
} // namespace
void U_CALLCONV XLikelySubtags::initLikelySubtags(UErrorCode &errorCode) {
@ -263,10 +395,14 @@ void U_CALLCONV XLikelySubtags::initLikelySubtags(UErrorCode &errorCode) {
data.load(errorCode);
if (U_FAILURE(errorCode)) { return; }
gLikelySubtags = new XLikelySubtags(data);
if (gLikelySubtags == nullptr) {
gMacroregions = loadMacroregions(errorCode);
if (U_FAILURE(errorCode) || gLikelySubtags == nullptr || gMacroregions == nullptr) {
delete gLikelySubtags;
delete gMacroregions;
errorCode = U_MEMORY_ALLOCATION_ERROR;
return;
}
ucln_common_registerCleanup(UCLN_COMMON_LIKELY_SUBTAGS, cleanup);
}
@ -317,15 +453,32 @@ XLikelySubtags::~XLikelySubtags() {
delete[] lsrs;
}
LSR XLikelySubtags::makeMaximizedLsrFrom(const Locale &locale, UErrorCode &errorCode) const {
LSR XLikelySubtags::makeMaximizedLsrFrom(const Locale &locale,
bool returnInputIfUnmatch,
UErrorCode &errorCode) const {
if (locale.isBogus()) {
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
return LSR("", "", "", LSR::EXPLICIT_LSR);
}
const char *name = locale.getName();
if (uprv_isAtSign(name[0]) && name[1] == 'x' && name[2] == '=') { // name.startsWith("@x=")
// Private use language tag x-subtag-subtag... which CLDR changes to
// und-x-subtag-subtag...
return LSR(name, "", "", LSR::EXPLICIT_LSR);
}
return makeMaximizedLsr(locale.getLanguage(), locale.getScript(), locale.getCountry(),
locale.getVariant(), errorCode);
LSR max = makeMaximizedLsr(locale.getLanguage(), locale.getScript(), locale.getCountry(),
locale.getVariant(), returnInputIfUnmatch, errorCode);
if (uprv_strlen(max.language) == 0 &&
uprv_strlen(max.script) == 0 &&
uprv_strlen(max.region) == 0) {
// No match. ICU API mandate us to
// If the provided ULocale instance is already in the maximal form, or
// there is no data available available for maximization, it will be
// returned.
return LSR(locale.getLanguage(), locale.getScript(), locale.getCountry(), LSR::EXPLICIT_LSR, errorCode);
}
return max;
}
namespace {
@ -338,7 +491,9 @@ const char *getCanonical(const CharStringMap &aliases, const char *alias) {
} // namespace
LSR XLikelySubtags::makeMaximizedLsr(const char *language, const char *script, const char *region,
const char *variant, UErrorCode &errorCode) const {
const char *variant,
bool returnInputIfUnmatch,
UErrorCode &errorCode) const {
// Handle pseudolocales like en-XA, ar-XB, fr-PSCRACK.
// They should match only themselves,
// not other locales with what looks like the same language and script subtags.
@ -378,64 +533,91 @@ LSR XLikelySubtags::makeMaximizedLsr(const char *language, const char *script, c
language = getCanonical(languageAliases, language);
// (We have no script mappings.)
region = getCanonical(regionAliases, region);
return maximize(language, script, region);
return maximize(language, script, region, returnInputIfUnmatch, errorCode);
}
LSR XLikelySubtags::maximize(const char *language, const char *script, const char *region) const {
if (uprv_strcmp(language, "und") == 0) {
LSR XLikelySubtags::maximize(const char *language, const char *script, const char *region,
bool returnInputIfUnmatch,
UErrorCode &errorCode) const {
return maximize({language, (int32_t)uprv_strlen(language)},
{script, (int32_t)uprv_strlen(script)},
{region, (int32_t)uprv_strlen(region)},
returnInputIfUnmatch,
errorCode);
}
bool XLikelySubtags::isMacroregion(StringPiece& region, UErrorCode& errorCode) const {
// In Java, we use Region class. In C++, since Region is under i18n,
// we read the same data used by Region into gMacroregions avoid dependency
// from common to i18n/region.cpp
if (U_FAILURE(errorCode)) { return false; }
umtx_initOnce(gInitOnce, &XLikelySubtags::initLikelySubtags, errorCode);
if (U_FAILURE(errorCode)) { return false; }
UnicodeString str(UnicodeString::fromUTF8(region));
return gMacroregions->contains((void *)&str);
}
LSR XLikelySubtags::maximize(StringPiece language, StringPiece script, StringPiece region,
bool returnInputIfUnmatch,
UErrorCode &errorCode) const {
if (U_FAILURE(errorCode)) {
return LSR(language, script, region, LSR::EXPLICIT_LSR, errorCode);
}
if (language.compare("und") == 0) {
language = "";
}
if (uprv_strcmp(script, "Zzzz") == 0) {
if (script.compare("Zzzz") == 0) {
script = "";
}
if (uprv_strcmp(region, "ZZ") == 0) {
if (region.compare("ZZ") == 0) {
region = "";
}
if (*script != 0 && *region != 0 && *language != 0) {
return LSR(language, script, region, LSR::EXPLICIT_LSR); // already maximized
if (!script.empty() && !region.empty() && !language.empty()) {
return LSR(language, script, region, LSR::EXPLICIT_LSR, errorCode); // already maximized
}
bool retainLanguage = false;
bool retainScript = false;
bool retainRegion = false;
uint32_t retainOldMask = 0;
BytesTrie iter(trie);
uint64_t state;
int32_t value;
// Small optimization: Array lookup for first language letter.
int32_t c0;
if (0 <= (c0 = uprv_lowerOrdinal(language[0])) && c0 <= 25 &&
language[1] != 0 && // language.length() >= 2
if (0 <= (c0 = uprv_lowerOrdinal(language.data()[0])) && c0 <= 25 &&
language.length() >= 2 &&
(state = trieFirstLetterStates[c0]) != 0) {
value = trieNext(iter.resetToState64(state), language, 1);
} else {
value = trieNext(iter, language, 0);
}
bool matchLanguage = (value >= 0);
bool matchScript = false;
if (value >= 0) {
if (*language != 0) {
retainOldMask |= 4;
}
retainLanguage = !language.empty();
state = iter.getState64();
} else {
retainOldMask |= 4;
retainLanguage = true;
iter.resetToState64(trieUndState); // "und" ("*")
state = 0;
}
if (value >= 0 && !script.empty()) {
matchScript = true;
}
if (value > 0) {
// Intermediate or final value from just language.
if (value == SKIP_SCRIPT) {
value = 0;
}
if (*script != 0) {
retainOldMask |= 2;
}
retainScript = !script.empty();
} else {
value = trieNext(iter, script, 0);
if (value >= 0) {
if (*script != 0) {
retainOldMask |= 2;
}
retainScript = !script.empty();
state = iter.getState64();
} else {
retainOldMask |= 2;
retainScript = true;
if (state == 0) {
iter.resetToState64(trieUndZzzzState); // "und-Zzzz" ("**")
} else {
@ -447,19 +629,19 @@ LSR XLikelySubtags::maximize(const char *language, const char *script, const cha
}
}
bool matchRegion = false;
if (value > 0) {
// Final value from just language or language+script.
if (*region != 0) {
retainOldMask |= 1;
}
retainRegion = !region.empty();
} else {
value = trieNext(iter, region, 0);
if (value >= 0) {
if (*region != 0) {
retainOldMask |= 1;
if (!region.empty() && !isMacroregion(region, errorCode)) {
retainRegion = true;
matchRegion = true;
}
} else {
retainOldMask |= 1;
retainRegion = true;
if (state == 0) {
value = defaultLsrIndex;
} else {
@ -470,28 +652,33 @@ LSR XLikelySubtags::maximize(const char *language, const char *script, const cha
}
}
U_ASSERT(value < lsrsLength);
const LSR &result = lsrs[value];
const LSR &matched = lsrs[value];
if (*language == 0) {
language = "und";
if (returnInputIfUnmatch &&
(!(matchLanguage || matchScript || (matchRegion && language.empty())))) {
return LSR("", "", "", LSR::EXPLICIT_LSR, errorCode); // no matching.
}
if (language.empty()) {
language = StringPiece("und");
}
if (retainOldMask == 0) {
if (!(retainLanguage || retainScript || retainRegion)) {
// Quickly return a copy of the lookup-result LSR
// without new allocation of the subtags.
return LSR(result.language, result.script, result.region, result.flags);
return LSR(matched.language, matched.script, matched.region, matched.flags);
}
if ((retainOldMask & 4) == 0) {
language = result.language;
if (!retainLanguage) {
language = matched.language;
}
if ((retainOldMask & 2) == 0) {
script = result.script;
if (!retainScript) {
script = matched.script;
}
if ((retainOldMask & 1) == 0) {
region = result.region;
if (!retainRegion) {
region = matched.region;
}
int32_t retainMask = (retainLanguage ? 4 : 0) + (retainScript ? 2 : 0) + (retainRegion ? 1 : 0);
// retainOldMask flags = LSR explicit-subtag flags
return LSR(language, script, region, retainOldMask);
return LSR(language, script, region, retainMask, errorCode);
}
int32_t XLikelySubtags::compareLikely(const LSR &lsr, const LSR &other, int32_t likelyInfo) const {
@ -627,57 +814,97 @@ int32_t XLikelySubtags::trieNext(BytesTrie &iter, const char *s, int32_t i) {
default: return -1;
}
}
// TODO(ICU-20777): Switch Locale/uloc_ likely-subtags API from the old code
// in loclikely.cpp to this new code, including activating this
// minimizeSubtags() function. The LocaleMatcher does not minimize.
#if 0
LSR XLikelySubtags::minimizeSubtags(const char *languageIn, const char *scriptIn,
const char *regionIn, ULocale.Minimize fieldToFavor,
UErrorCode &errorCode) const {
LSR result = maximize(languageIn, scriptIn, regionIn);
// We could try just a series of checks, like:
// LSR result2 = addLikelySubtags(languageIn, "", "");
// if result.equals(result2) return result2;
// However, we can optimize 2 of the cases:
// (languageIn, "", "")
// (languageIn, "", regionIn)
// value00 = lookup(result.language, "", "")
BytesTrie iter = new BytesTrie(trie);
int value = trieNext(iter, result.language, 0);
U_ASSERT(value >= 0);
if (value == 0) {
value = trieNext(iter, "", 0);
U_ASSERT(value >= 0);
if (value == 0) {
value = trieNext(iter, "", 0);
int32_t XLikelySubtags::trieNext(BytesTrie &iter, StringPiece s, int32_t i) {
UStringTrieResult result;
uint8_t c;
if (s.length() == i) {
result = iter.next(u'*');
} else {
c = s.data()[i];
for (;;) {
c = uprv_invCharToAscii(c);
// EBCDIC: If s[i] is not an invariant character,
// then c is now 0 and will simply not match anything, which is harmless.
if (i+1 != s.length()) {
if (!USTRINGTRIE_HAS_NEXT(iter.next(c))) {
return -1;
}
c = s.data()[++i];
} else {
// last character of this subtag
result = iter.next(c | 0x80);
break;
}
}
}
U_ASSERT(value > 0);
LSR value00 = lsrs[value];
boolean favorRegionOk = false;
if (result.script.equals(value00.script)) { //script is default
if (result.region.equals(value00.region)) {
return new LSR(result.language, "", "", LSR.DONT_CARE_FLAGS);
} else if (fieldToFavor == ULocale.Minimize.FAVOR_REGION) {
return new LSR(result.language, "", result.region, LSR.DONT_CARE_FLAGS);
} else {
favorRegionOk = true;
}
switch (result) {
case USTRINGTRIE_NO_MATCH: return -1;
case USTRINGTRIE_NO_VALUE: return 0;
case USTRINGTRIE_INTERMEDIATE_VALUE:
U_ASSERT(iter.getValue() == SKIP_SCRIPT);
return SKIP_SCRIPT;
case USTRINGTRIE_FINAL_VALUE: return iter.getValue();
default: return -1;
}
// The last case is not as easy to optimize.
// Maybe do later, but for now use the straightforward code.
LSR result2 = maximize(languageIn, scriptIn, "");
if (result2.equals(result)) {
return new LSR(result.language, result.script, "", LSR.DONT_CARE_FLAGS);
} else if (favorRegionOk) {
return new LSR(result.language, "", result.region, LSR.DONT_CARE_FLAGS);
}
return result;
}
#endif
LSR XLikelySubtags::minimizeSubtags(StringPiece language, StringPiece script,
StringPiece region,
bool favorScript,
UErrorCode &errorCode) const {
LSR max = maximize(language, script, region, true, errorCode);
if (U_FAILURE(errorCode)) {
return max;
}
// If no match, return it.
if (uprv_strlen(max.language) == 0 &&
uprv_strlen(max.script) == 0 &&
uprv_strlen(max.region) == 0) {
// No match. ICU API mandate us to
// "If this Locale is already in the minimal form, or not valid, or
// there is no data available for minimization, the Locale will be
// unchanged."
return LSR(language, script, region, LSR::EXPLICIT_LSR, errorCode);
}
// try language
LSR test = maximize(max.language, "", "", true, errorCode);
if (U_FAILURE(errorCode)) {
return max;
}
if (test.isEquivalentTo(max)) {
return LSR(max.language, "", "", LSR::DONT_CARE_FLAGS, errorCode);
}
if (!favorScript) {
// favor Region
// try language and region
test = maximize(max.language, "", max.region, true, errorCode);
if (U_FAILURE(errorCode)) {
return max;
}
if (test.isEquivalentTo(max)) {
return LSR(max.language, "", max.region, LSR::DONT_CARE_FLAGS, errorCode);
}
}
// try language and script
test = maximize(max.language, max.script, "", true, errorCode);
if (U_FAILURE(errorCode)) {
return max;
}
if (test.isEquivalentTo(max)) {
return LSR(max.language, max.script, "", LSR::DONT_CARE_FLAGS, errorCode);
}
if (favorScript) {
// try language and region
test = maximize(max.language, "", max.region, true, errorCode);
if (U_FAILURE(errorCode)) {
return max;
}
if (test.isEquivalentTo(max)) {
return LSR(max.language, "", max.region, LSR::DONT_CARE_FLAGS, errorCode);
}
}
return LSR(max.language, max.script, max.region, LSR::DONT_CARE_FLAGS, errorCode);
}
U_NAMESPACE_END

View file

@ -11,6 +11,7 @@
#include "unicode/utypes.h"
#include "unicode/bytestrie.h"
#include "unicode/locid.h"
#include "unicode/stringpiece.h"
#include "unicode/uobject.h"
#include "unicode/ures.h"
#include "charstrmap.h"
@ -47,7 +48,9 @@ public:
static const XLikelySubtags *getSingleton(UErrorCode &errorCode);
// VisibleForTesting
LSR makeMaximizedLsrFrom(const Locale &locale, UErrorCode &errorCode) const;
LSR makeMaximizedLsrFrom(const Locale &locale,
bool returnInputIfUnmatch,
UErrorCode &errorCode) const;
/**
* Tests whether lsr is "more likely" than other.
@ -61,13 +64,9 @@ public:
*/
int32_t compareLikely(const LSR &lsr, const LSR &other, int32_t likelyInfo) const;
// TODO(ICU-20777): Switch Locale/uloc_ likely-subtags API from the old code
// in loclikely.cpp to this new code, including activating this
// minimizeSubtags() function. The LocaleMatcher does not minimize.
#if 0
LSR minimizeSubtags(const char *languageIn, const char *scriptIn, const char *regionIn,
ULocale.Minimize fieldToFavor, UErrorCode &errorCode) const;
#endif
LSR minimizeSubtags(StringPiece language, StringPiece script, StringPiece region,
bool favorScript,
UErrorCode &errorCode) const;
// visible for LocaleDistance
const LocaleDistanceData &getDistanceData() const { return distanceData; }
@ -80,16 +79,25 @@ private:
static void initLikelySubtags(UErrorCode &errorCode);
LSR makeMaximizedLsr(const char *language, const char *script, const char *region,
const char *variant, UErrorCode &errorCode) const;
const char *variant,
bool returnInputIfUnmatch,
UErrorCode &errorCode) const;
/**
* Raw access to addLikelySubtags. Input must be in canonical format, eg "en", not "eng" or "EN".
*/
LSR maximize(const char *language, const char *script, const char *region) const;
LSR maximize(const char *language, const char *script, const char *region,
bool returnInputIfUnmatch,
UErrorCode &errorCode) const;
LSR maximize(StringPiece language, StringPiece script, StringPiece region,
bool returnInputIfUnmatch,
UErrorCode &errorCode) const;
int32_t getLikelyIndex(const char *language, const char *script) const;
bool isMacroregion(StringPiece& region, UErrorCode &errorCode) const;
static int32_t trieNext(BytesTrie &iter, const char *s, int32_t i);
static int32_t trieNext(BytesTrie &iter, StringPiece s, int32_t i);
UResourceBundle *langInfoBundle;
// We could store the strings by value, except that if there were few enough strings,

View file

@ -1170,7 +1170,7 @@ uprv_convertToLCIDPlatform(const char* localeID, UErrorCode* status)
// conversion functionality when available.
#if U_PLATFORM_HAS_WIN32_API && UCONFIG_USE_WINDOWS_LCID_MAPPING_API
int32_t len;
char baseName[ULOC_FULLNAME_CAPACITY] = {};
icu::CharString baseName;
const char * mylocaleID = localeID;
// Check any for keywords.
@ -1189,19 +1189,23 @@ uprv_convertToLCIDPlatform(const char* localeID, UErrorCode* status)
else
{
// If the locale ID contains keywords other than collation, just use the base name.
len = uloc_getBaseName(localeID, baseName, UPRV_LENGTHOF(baseName) - 1, status);
if (U_SUCCESS(*status) && len > 0)
{
baseName[len] = 0;
mylocaleID = baseName;
icu::CharStringByteSink sink(&baseName);
ulocimp_getBaseName(localeID, sink, status);
}
if (U_SUCCESS(*status) && !baseName.isEmpty())
{
mylocaleID = baseName.data();
}
}
}
char asciiBCP47Tag[LOCALE_NAME_MAX_LENGTH] = {};
// this will change it from de_DE@collation=phonebook to de-DE-u-co-phonebk form
(void)uloc_toLanguageTag(mylocaleID, asciiBCP47Tag, UPRV_LENGTHOF(asciiBCP47Tag), false, status);
icu::CharString asciiBCP47Tag;
{
icu::CharStringByteSink sink(&asciiBCP47Tag);
ulocimp_toLanguageTag(mylocaleID, sink, false, status);
}
if (U_SUCCESS(*status))
{

View file

@ -24,6 +24,8 @@
#include "unicode/putil.h"
#include "unicode/uloc.h"
#include "unicode/ures.h"
#include "bytesinkutil.h"
#include "charstr.h"
#include "cstring.h"
#include "ulocimp.h"
#include "uresimp.h"
@ -156,16 +158,18 @@ _uloc_getOrientationHelper(const char* localeId,
ULayoutType result = ULOC_LAYOUT_UNKNOWN;
if (!U_FAILURE(*status)) {
int32_t length = 0;
char localeBuffer[ULOC_FULLNAME_CAPACITY];
uloc_canonicalize(localeId, localeBuffer, sizeof(localeBuffer), status);
icu::CharString localeBuffer;
{
icu::CharStringByteSink sink(&localeBuffer);
ulocimp_canonicalize(localeId, sink, status);
}
if (!U_FAILURE(*status)) {
int32_t length = 0;
const char16_t* const value =
uloc_getTableStringWithFallback(
nullptr,
localeBuffer,
localeBuffer.data(),
"layout",
nullptr,
key,

View file

@ -31,6 +31,26 @@ LSR::LSR(char prefix, const char *lang, const char *scr, const char *r, int32_t
}
}
LSR::LSR(StringPiece lang, StringPiece scr, StringPiece r, int32_t f,
UErrorCode &errorCode) :
language(nullptr), script(nullptr), region(nullptr),
regionIndex(indexForRegion(r.data())), flags(f) {
if (U_SUCCESS(errorCode)) {
CharString data;
data.append(lang, errorCode).append('\0', errorCode);
int32_t scriptOffset = data.length();
data.append(scr, errorCode).append('\0', errorCode);
int32_t regionOffset = data.length();
data.append(r, errorCode);
owned = data.cloneData(errorCode);
if (U_SUCCESS(errorCode)) {
language = owned;
script = owned + scriptOffset;
region = owned + regionOffset;
}
}
}
LSR::LSR(LSR &&other) noexcept :
language(other.language), script(other.script), region(other.region), owned(other.owned),
regionIndex(other.regionIndex), flags(other.flags),

View file

@ -7,6 +7,7 @@
#ifndef __LSR_H__
#define __LSR_H__
#include "unicode/stringpiece.h"
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "cstring.h"
@ -45,6 +46,8 @@ struct LSR final : public UMemory {
*/
LSR(char prefix, const char *lang, const char *scr, const char *r, int32_t f,
UErrorCode &errorCode);
LSR(StringPiece lang, StringPiece scr, StringPiece r, int32_t f,
UErrorCode &errorCode);
LSR(LSR &&other) noexcept;
LSR(const LSR &other) = delete;
inline ~LSR() {

View file

@ -10,7 +10,7 @@
#ifdef INCLUDED_FROM_NORMALIZER2_CPP
static const UVersionInfo norm2_nfc_data_formatVersion={4,0,0,0};
static const UVersionInfo norm2_nfc_data_dataVersion={0xf,0,0,0};
static const UVersionInfo norm2_nfc_data_dataVersion={0xf,1,0,0};
static const int32_t norm2_nfc_data_indexes[Normalizer2Impl::IX_COUNT]={
0x50,0x4cb8,0x8920,0x8a20,0x8a20,0x8a20,0x8a20,0x8a20,0xc0,0x300,0xae2,0x29e0,0x3c66,0xfc00,0x1288,0x3b9c,

View file

@ -391,6 +391,7 @@ struct Norm2AllModes : public UMemory {
static const Norm2AllModes *getNFCInstance(UErrorCode &errorCode);
static const Norm2AllModes *getNFKCInstance(UErrorCode &errorCode);
static const Norm2AllModes *getNFKC_CFInstance(UErrorCode &errorCode);
static const Norm2AllModes *getNFKC_SCFInstance(UErrorCode &errorCode);
Normalizer2Impl *impl;
ComposeNormalizer2 comp;

View file

@ -789,7 +789,8 @@ unorm_getFCD16(UChar32 c);
*
* Normalizer2 .nrm data files provide data for the Unicode Normalization algorithms.
* ICU ships with data files for standard Unicode Normalization Forms
* NFC and NFD (nfc.nrm), NFKC and NFKD (nfkc.nrm) and NFKC_Casefold (nfkc_cf.nrm).
* NFC and NFD (nfc.nrm), NFKC and NFKD (nfkc.nrm),
* NFKC_Casefold (nfkc_cf.nrm) and NFKC_Simple_Casefold (nfkc_scf.nrm).
* Custom (application-specific) data can be built into additional .nrm files
* with the gennorm2 build tool.
* ICU ships with one such file, uts46.nrm, for the implementation of UTS #46.

File diff suppressed because it is too large Load diff

View file

@ -1175,6 +1175,21 @@ uprv_tzname(int n)
if (ret != nullptr && uprv_strcmp(TZDEFAULT, gTimeZoneBuffer) != 0) {
int32_t tzZoneInfoTailLen = uprv_strlen(TZZONEINFOTAIL);
const char *tzZoneInfoTailPtr = uprv_strstr(gTimeZoneBuffer, TZZONEINFOTAIL);
// MacOS14 has the realpath as something like
// /usr/share/zoneinfo.default/Australia/Melbourne
// which will not have "/zoneinfo/" in the path.
// Therefore if we fail, we fall back to read the link which is
// /var/db/timezone/zoneinfo/Australia/Melbourne
// We also fall back to reading the link if the realpath leads to something like
// /usr/share/zoneinfo/posixrules
if (tzZoneInfoTailPtr == nullptr ||
uprv_strcmp(tzZoneInfoTailPtr + tzZoneInfoTailLen, "posixrules") == 0) {
ssize_t size = readlink(TZDEFAULT, gTimeZoneBuffer, sizeof(gTimeZoneBuffer)-1);
if (size > 0) {
gTimeZoneBuffer[size] = 0;
tzZoneInfoTailPtr = uprv_strstr(gTimeZoneBuffer, TZZONEINFOTAIL);
}
}
if (tzZoneInfoTailPtr != nullptr) {
tzZoneInfoTailPtr += tzZoneInfoTailLen;
skipZoneIDPrefix(&tzZoneInfoTailPtr);

View file

@ -1125,6 +1125,7 @@ static icu::UStack *gLanguageBreakFactories = nullptr;
static const icu::UnicodeString *gEmptyString = nullptr;
static icu::UInitOnce gLanguageBreakFactoriesInitOnce {};
static icu::UInitOnce gRBBIInitOnce {};
static icu::ICULanguageBreakFactory *gICULanguageBreakFactory = nullptr;
/**
* Release all static memory held by breakiterator.
@ -1153,37 +1154,41 @@ static void U_CALLCONV rbbiInit() {
ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup);
}
static void U_CALLCONV initLanguageFactories() {
UErrorCode status = U_ZERO_ERROR;
static void U_CALLCONV initLanguageFactories(UErrorCode& status) {
U_ASSERT(gLanguageBreakFactories == nullptr);
gLanguageBreakFactories = new UStack(_deleteFactory, nullptr, status);
if (gLanguageBreakFactories != nullptr && U_SUCCESS(status)) {
ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
gLanguageBreakFactories->push(builtIn, status);
LocalPointer<ICULanguageBreakFactory> factory(new ICULanguageBreakFactory(status), status);
if (U_SUCCESS(status)) {
gICULanguageBreakFactory = factory.orphan();
gLanguageBreakFactories->push(gICULanguageBreakFactory, status);
#ifdef U_LOCAL_SERVICE_HOOK
LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
if (extra != nullptr) {
gLanguageBreakFactories->push(extra, status);
}
LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
if (extra != nullptr) {
gLanguageBreakFactories->push(extra, status);
}
#endif
}
}
ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup);
}
void ensureLanguageFactories(UErrorCode& status) {
umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories, status);
}
static const LanguageBreakEngine*
getLanguageBreakEngineFromFactory(UChar32 c)
getLanguageBreakEngineFromFactory(UChar32 c, const char* locale)
{
umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories);
if (gLanguageBreakFactories == nullptr) {
return nullptr;
}
UErrorCode status = U_ZERO_ERROR;
ensureLanguageFactories(status);
if (U_FAILURE(status)) return nullptr;
int32_t i = gLanguageBreakFactories->size();
const LanguageBreakEngine *lbe = nullptr;
while (--i >= 0) {
LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
lbe = factory->getEngineFor(c);
lbe = factory->getEngineFor(c, locale);
if (lbe != nullptr) {
break;
}
@ -1199,7 +1204,7 @@ getLanguageBreakEngineFromFactory(UChar32 c)
//
//-------------------------------------------------------------------------------
const LanguageBreakEngine *
RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c, const char* locale) {
const LanguageBreakEngine *lbe = nullptr;
UErrorCode status = U_ZERO_ERROR;
@ -1215,14 +1220,14 @@ RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
int32_t i = fLanguageBreakEngines->size();
while (--i >= 0) {
lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
if (lbe->handles(c)) {
if (lbe->handles(c, locale)) {
return lbe;
}
}
// No existing dictionary took the character. See if a factory wants to
// give us a new LanguageBreakEngine for this character.
lbe = getLanguageBreakEngineFromFactory(c);
lbe = getLanguageBreakEngineFromFactory(c, locale);
// If we got one, use it and push it on our stack.
if (lbe != nullptr) {
@ -1259,6 +1264,18 @@ RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
return fUnhandledBreakEngine;
}
#ifndef U_HIDE_DRAFT_API
void U_EXPORT2 RuleBasedBreakIterator::registerExternalBreakEngine(
ExternalBreakEngine* toAdopt, UErrorCode& status) {
LocalPointer<ExternalBreakEngine> engine(toAdopt, status);
if (U_FAILURE(status)) return;
ensureLanguageFactories(status);
if (U_FAILURE(status)) return;
gICULanguageBreakFactory->addExternalEngine(engine.orphan(), status);
}
#endif /* U_HIDE_DRAFT_API */
void RuleBasedBreakIterator::dumpCache() {
fBreakCache->dumpCache();
}

View file

@ -158,12 +158,13 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
// We now have a dictionary character. Get the appropriate language object
// to deal with it.
const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(c);
const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(
c, fBI->getLocaleID(ULOC_REQUESTED_LOCALE, status));
// Ask the language object if there are any breaks. It will add them to the cache and
// leave the text pointer on the other side of its range, ready to search for the next one.
if (lbe != nullptr) {
foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status);
foundBreakCount += lbe->findBreaks(text, current, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status);
}
// Reload the loop variables for the next go-round

View file

@ -66,7 +66,6 @@ RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules,
fForwardTable = nullptr;
fRuleStatusVals = nullptr;
fChainRules = false;
fLBCMNoChain = false;
fLookAheadHardBreak = false;
fUSetNodes = nullptr;
fRuleStatusVals = nullptr;

View file

@ -159,9 +159,6 @@ public:
UBool fChainRules; // True for chained Unicode TR style rules.
// False for traditional regexp rules.
UBool fLBCMNoChain; // True: suppress chaining of rules on
// chars with LineBreak property == CM.
UBool fLookAheadHardBreak; // True: Look ahead matches cause an
// immediate break, no continuing for the
// longest match.

View file

@ -547,8 +547,6 @@ UBool RBBIRuleScanner::doParseActions(int32_t action)
UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart);
if (opt == UNICODE_STRING("chain", 5)) {
fRB->fChainRules = true;
} else if (opt == UNICODE_STRING("LBCMNoChain", 11)) {
fRB->fLBCMNoChain = true;
} else if (opt == UNICODE_STRING("forward", 7)) {
fRB->fDefaultTree = &fRB->fForwardTree;
} else if (opt == UNICODE_STRING("reverse", 7)) {

View file

@ -458,21 +458,6 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree, RBBINode *endMarkNod
// We've got a node that can end a match.
// !!LBCMNoChain implementation: If this node's val correspond to
// the Line Break $CM char class, don't chain from it.
// TODO: Remove this. !!LBCMNoChain is deprecated, and is not used
// by any of the standard ICU rules.
if (fRB->fLBCMNoChain) {
UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
if (c != -1) {
// c == -1 occurs with sets containing only the {eof} marker string.
ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
if (cLBProp == U_LB_COMBINING_MARK) {
continue;
}
}
}
// Now iterate over the nodes that can start a match, looking for ones
// with the same char class as our ending node.
RBBINode *startNode;

View file

@ -9,11 +9,11 @@
#ifdef INCLUDED_FROM_UBIDI_PROPS_C
static const UVersionInfo ubidi_props_dataVersion={0xf,0,0,0};
static const UVersionInfo ubidi_props_dataVersion={0xf,1,0,0};
static const int32_t ubidi_props_indexes[UBIDI_IX_TOP]={0x10,0x6bc0,0x65d0,0x28,0x620,0x8cc,0x10ac0,0x10d24,0,0,0,0,0,0,0,0x6702b6};
static const int32_t ubidi_props_indexes[UBIDI_IX_TOP]={0x10,0x6ba0,0x65b0,0x28,0x620,0x8cc,0x10ac0,0x10d24,0,0,0,0,0,0,0,0x6702b6};
static const uint16_t ubidi_props_trieIndex[13024]={
static const uint16_t ubidi_props_trieIndex[13008]={
0x387,0x38f,0x397,0x39f,0x3b7,0x3bf,0x3c7,0x3cf,0x3a7,0x3af,0x3a7,0x3af,0x3a7,0x3af,0x3a7,0x3af,
0x3a7,0x3af,0x3a7,0x3af,0x3d5,0x3dd,0x3e5,0x3ed,0x3f5,0x3fd,0x3f9,0x401,0x409,0x411,0x40c,0x414,
0x3a7,0x3af,0x3a7,0x3af,0x41c,0x424,0x3a7,0x3af,0x3a7,0x3af,0x3a7,0x3af,0x42a,0x432,0x43a,0x442,
@ -38,8 +38,8 @@ static const uint16_t ubidi_props_trieIndex[13024]={
0x7e8,0x7f0,0x7f8,0x7ff,0x806,0x80e,0x812,0x7e0,0x67c,0x67c,0x67c,0x81a,0x820,0x67c,0x67c,0x826,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x82e,0x3a7,0x3a7,0x3a7,0x836,0x3a7,0x3a7,0x3a7,0x3f5,
0x83e,0x846,0x849,0x3a7,0x851,0x67c,0x67c,0x67f,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x858,0x85e,
0x86e,0x866,0x3a7,0x3a7,0x876,0x61f,0x3a7,0x3ce,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x67c,0x835,
0x3dc,0x3a7,0x87e,0x886,0x3a7,0x88e,0x896,0x3a7,0x3a7,0x3a7,0x3a7,0x89a,0x3a7,0x3a7,0x674,0x3cd,
0x86e,0x866,0x3a7,0x3a7,0x876,0x61f,0x3a7,0x3ce,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x67c,0x87e,
0x3dc,0x3a7,0x85e,0x882,0x3a7,0x88a,0x892,0x3a7,0x3a7,0x3a7,0x3a7,0x896,0x3a7,0x3a7,0x674,0x3cd,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
@ -96,10 +96,10 @@ static const uint16_t ubidi_props_trieIndex[13024]={
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x87e,0x67c,0x595,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x8a1,0x3a7,0x3a7,0x8a6,0x8ae,0x3a7,0x3a7,0x5cb,0x67c,0x673,0x3a7,0x3a7,0x8b6,0x3a7,0x3a7,0x3a7,
0x8be,0x8c5,0x645,0x8cd,0x3a7,0x3a7,0x5a1,0x8d5,0x3a7,0x8dd,0x8e4,0x3a7,0x501,0x8e9,0x3a7,0x51a,
0x3a7,0x8f1,0x8f9,0x51c,0x3a7,0x8fd,0x51b,0x905,0x3a7,0x3a7,0x3a7,0x90b,0x3a7,0x3a7,0x3a7,0x912,
0x3a7,0x3a7,0x3a7,0x3a7,0x85e,0x67c,0x595,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x89d,0x3a7,0x3a7,0x8a2,0x8aa,0x3a7,0x3a7,0x5cb,0x67c,0x673,0x3a7,0x3a7,0x8b2,0x3a7,0x3a7,0x3a7,
0x8ba,0x8c1,0x645,0x8c9,0x3a7,0x3a7,0x5a1,0x8d1,0x3a7,0x8d9,0x8e0,0x3a7,0x501,0x8e5,0x3a7,0x51a,
0x3a7,0x8ed,0x8f5,0x51c,0x3a7,0x8f9,0x51b,0x901,0x3a7,0x3a7,0x3a7,0x907,0x3a7,0x3a7,0x3a7,0x90e,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
@ -139,9 +139,9 @@ static const uint16_t ubidi_props_trieIndex[13024]={
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x926,0x91a,0x91e,0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,
0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,0x92e,0x936,0x4a6,0x4a6,0x4a6,0x93b,0x93f,
0x947,0x94f,0x953,0x95b,0x4a6,0x4a6,0x4a6,0x95f,0x967,0x397,0x96f,0x977,0x3a7,0x3a7,0x3a7,0x97f,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x922,0x916,0x91a,0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,
0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,0x92a,0x932,0x4a6,0x4a6,0x4a6,0x937,0x93b,
0x943,0x94b,0x94f,0x957,0x4a6,0x4a6,0x4a6,0x95b,0x963,0x397,0x96b,0x973,0x3a7,0x3a7,0x3a7,0x97b,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0xe9c,0xe9c,0xedc,0xf1c,0xe9c,0xe9c,0xe9c,0xe9c,0xe9c,0xe9c,0xf54,0xf94,0xfd4,0xfe4,0x1024,0x1030,
@ -178,68 +178,68 @@ static const uint16_t ubidi_props_trieIndex[13024]={
0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0xd89,
0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,
0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0x1a0,0xd89,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x987,0x3a7,0x67c,0x67c,0x98f,0x61f,0x3a7,0x514,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x997,0x3a7,0x3a7,0x3a7,0x99e,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x983,0x3a7,0x67c,0x67c,0x98b,0x61f,0x3a7,0x514,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x993,0x3a7,0x3a7,0x3a7,0x99a,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x9a6,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,
0x9ae,0x9b2,0x43c,0x43c,0x43c,0x43c,0x9c2,0x9ba,0x43c,0x9ca,0x43c,0x43c,0x9d2,0x9d8,0x43c,0x43c,
0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x9e8,0x9e0,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,
0x43c,0x43c,0x43c,0x9f0,0x43c,0x9f8,0x4a6,0xa00,0x43c,0xa08,0xa0f,0xa15,0xa1d,0xa21,0xa29,0x43c,
0x51b,0xa31,0xa38,0xa3f,0x41e,0xa47,0x569,0x3a7,0x501,0xa4e,0x3a7,0xa54,0x41e,0xa59,0xa61,0x3a7,
0x3a7,0xa66,0x51b,0x3a7,0x3a7,0x3a7,0x836,0xa6e,0x41e,0x5a3,0x57e,0xa75,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0xa31,0xa7d,0x3a7,0x3a7,0xa85,0xa8d,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0xa91,0xa99,0x3a7,
0x3a7,0xaa1,0x57e,0xaa9,0x3a7,0xaaf,0x3a7,0x3a7,0x60f,0xab7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0xabc,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0xac3,0xacb,0x3a7,0x3a7,0x3a7,0xace,0x57e,0xad6,
0xada,0xae2,0x3a7,0xae9,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0xaf0,0x3a7,0x3a7,0xafe,0xaf8,0x3a7,0x3a7,0x3a7,0xb06,0xb0e,0x3a7,0xb12,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x5a5,0x41e,0x99e,0xb1a,0x3a7,0x3a7,0x3a7,0xb27,0xb22,0x3a7,
0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x9a2,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,
0x9aa,0x9ae,0x43c,0x43c,0x43c,0x43c,0x9be,0x9b6,0x43c,0x9c6,0x43c,0x43c,0x9ce,0x9d4,0x43c,0x43c,
0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x9e4,0x9dc,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,
0x43c,0x43c,0x43c,0x9ec,0x43c,0x9f4,0x4a6,0x9fc,0x43c,0xa04,0xa0b,0xa11,0xa19,0xa1d,0xa25,0x43c,
0x51b,0xa2d,0xa34,0xa3b,0x41e,0xa43,0x569,0x3a7,0x501,0xa4a,0x3a7,0xa50,0x41e,0xa55,0xa5d,0x3a7,
0x3a7,0xa62,0x51b,0x3a7,0x3a7,0x3a7,0x836,0xa6a,0x41e,0x5a3,0x57e,0xa71,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0xa2d,0xa79,0x3a7,0x3a7,0xa81,0xa89,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0xa8d,0xa95,0x3a7,
0x3a7,0xa9d,0x57e,0xaa5,0x3a7,0xaab,0x3a7,0x3a7,0x60f,0xab3,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0xab8,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0xabf,0xac7,0x3a7,0x3a7,0x3a7,0xaca,0x57e,0xad2,
0xad6,0xade,0x3a7,0xae5,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0xaec,0x3a7,0x3a7,0xafa,0xaf4,0x3a7,0x3a7,0x3a7,0xb02,0xb0a,0x3a7,0xb0e,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x5a5,0x41e,0x99a,0xb16,0x3a7,0x3a7,0x3a7,0xb23,0xb1e,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0xb2f,0xb37,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0xb3d,
0x3a7,0xb43,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0xb2b,0xb33,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0xb39,
0x3a7,0xb3f,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0xa55,0x3a7,0xb49,0x3a7,0x3a7,0xb51,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0xa51,0x3a7,0xb45,0x3a7,0x3a7,0xb4d,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x535,0xb59,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x535,0xb55,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3f5,0xb61,0x500,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0xb69,0xb71,0xb77,0x3a7,0xb7d,0x67c,0x67c,0xb85,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x67c,0x67c,0xb8d,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0xb93,
0x3a7,0xb9a,0x3a7,0xb96,0x3a7,0xb9d,0x3a7,0xba5,0xba9,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3f5,0xbb1,0x3f5,0xbb8,0xbbf,0xbc7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3f5,0xb5d,0x500,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0xb65,0xb6d,0xb73,0x3a7,0xb79,0x67c,0x67c,0xb81,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x67c,0x67c,0xb89,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0xb8f,
0x3a7,0xb96,0x3a7,0xb92,0x3a7,0xb99,0x3a7,0xba1,0xba5,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3f5,0xbad,0x3f5,0xbb4,0xbbb,0xbc3,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0xbcf,0xbd7,0x3a7,0x3a7,0xa55,0x3a7,0x3a7,
0x3a7,0x3a7,0xb43,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0xa81,0x3a7,
0xbdc,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0xbe4,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0xbec,
0x43c,0xbf4,0xbf4,0xbfb,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,
0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x91e,0x4a6,0x4a6,0x43c,
0x43c,0x4a6,0x4a6,0xc03,0x43c,0x43c,0x43c,0x43c,0x43c,0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,
0xc0b,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x67c,0xc13,0x67c,0x67c,0x67f,0xc18,0xc1c,
0x858,0xc24,0x3c9,0x3a7,0xc2a,0x3a7,0xc2f,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x783,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0xbcb,0xbd3,0x3a7,0x3a7,0xa51,0x3a7,0x3a7,
0x3a7,0x3a7,0xb3f,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0xa7d,0x3a7,
0xbd8,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0xbe0,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0xbe8,
0x43c,0xbf0,0xbf0,0xbf7,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,
0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x91a,0x4a6,0x4a6,0x43c,
0x43c,0x4a6,0x4a6,0xbff,0x43c,0x43c,0x43c,0x43c,0x43c,0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,0x4a6,
0xc07,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x43c,0x67c,0xc0f,0x67c,0x67c,0x67f,0xc14,0xc18,
0x858,0xc20,0x3c9,0x3a7,0xc26,0x3a7,0xc2b,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x783,0x3a7,0x3a7,0x3a7,
0x3a7,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,
0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0xc37,
0x98f,0x67c,0x67c,0x67c,0xc3e,0x67c,0x67c,0xc45,0xc4d,0xc13,0x67c,0xc55,0x67c,0xc5d,0xc62,0x3a7,
0x3a7,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67f,0xc6a,0xc73,0xc77,0xc7f,
0xc6f,0x67c,0x67c,0x67c,0x67c,0xc87,0x67c,0x792,0xc8f,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0xc33,
0x98b,0x67c,0x67c,0x67c,0xc3a,0x67c,0x67c,0xc41,0xc49,0xc0f,0x67c,0xc51,0x67c,0xc59,0xc5e,0x3a7,
0x3a7,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67c,0x67f,0xc66,0xc6f,0xc73,0xc7b,
0xc6b,0x67c,0x67c,0x67c,0x67c,0xc83,0x67c,0x792,0xc8b,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0xc96,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0xc92,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0xc96,0xca6,0xc9e,0xc9e,0xc9e,0xca7,0xca7,0xca7,
0xca7,0x3f5,0x3f5,0x3f5,0x3f5,0x3f5,0x3f5,0x3f5,0xcaf,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,
0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,
0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,
0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,
0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0xca7,0x386,0x386,0x386,0x12,0x12,0x12,0x12,
0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0x3a7,0xc92,0xca2,0xc9a,0xc9a,0xc9a,0xca3,0xca3,0xca3,
0xca3,0x3f5,0x3f5,0x3f5,0x3f5,0x3f5,0x3f5,0x3f5,0xcab,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,
0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,
0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,
0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,
0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0xca3,0x386,0x386,0x386,0x12,0x12,0x12,0x12,
0x12,0x12,0x12,0x12,0x12,8,7,8,9,7,0x12,0x12,0x12,0x12,0x12,0x12,
0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,7,7,7,8,9,0xa,0xa,4,
4,4,0xa,0xa,0x310a,0xf20a,0xa,3,6,3,6,6,2,2,2,2,
@ -551,15 +551,14 @@ static const uint16_t ubidi_props_trieIndex[13024]={
0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,
0xa,0xa,0xa,0xa,0xa,0xa,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,
0xa,0xa,0xa,0xa,0,0,0,0,0xa,0,0,0,0,0,0,0,
0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0,0,0,0,0,0,0,
0,0,0xb1,0xb1,0xb1,0xb1,0,0,0xa,0,0,0,0,0,0xa,0xa,
0,0,0,0,0,0xa,0xa,0xa,9,0xa,0xa,0xa,0xa,0,0,0,
0x310a,0xf20a,0x310a,0xf20a,0x310a,0xf20a,0x310a,0xf20a,0x310a,0xf20a,0xa,0xa,0x310a,0xf20a,0x310a,0xf20a,
0x310a,0xf20a,0x310a,0xf20a,0xa,0xa,0xa,0xa,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0xb1,0xb1,0xa,0xa,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,
0xa,0xa,0xa,0xa,0xa,0xa,0xa,0xa,0,0,0,0,0,0,0,0,
0,0xb1,0xb1,0xa,0xa,0,0,0,0xa,0xa,0xa,0xa,0,0,0,0,
0,0,0,0,0,0,0,0xa,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0xa,0xa,0xa,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0xa,0xa,0xa,0xa,0xa,0xa,0xa,
@ -935,13 +934,13 @@ static const UBiDiProps ubidi_props_singleton={
ubidi_props_trieIndex+3612,
nullptr,
3612,
9412,
9396,
0x1a0,
0xe9c,
0x0,
0x0,
0x110000,
0x32dc,
0x32cc,
nullptr, 0, false, false, 0, nullptr
},
{ 2,2,0,0 }

View file

@ -317,43 +317,6 @@ ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
}
}
namespace {
/**
* Add the simple case closure mapping,
* except if there is not actually an scf relationship between the two characters.
* TODO: Unicode should probably add the corresponding scf mappings.
* See https://crbug.com/v8/13377 and Unicode-internal PAG issue #23.
* If & when those scf mappings are added, we should be able to remove all of these exceptions.
*/
void addOneSimpleCaseClosure(UChar32 c, UChar32 t, const USetAdder *sa) {
switch (c) {
case 0x0390:
if (t == 0x1FD3) { return; }
break;
case 0x03B0:
if (t == 0x1FE3) { return; }
break;
case 0x1FD3:
if (t == 0x0390) { return; }
break;
case 0x1FE3:
if (t == 0x03B0) { return; }
break;
case 0xFB05:
if (t == 0xFB06) { return; }
break;
case 0xFB06:
if (t == 0xFB05) { return; }
break;
default:
break;
}
sa->add(sa->set, t);
}
} // namespace
U_CFUNC void U_EXPORT2
ucase_addSimpleCaseClosure(UChar32 c, const USetAdder *sa) {
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
@ -397,7 +360,7 @@ ucase_addSimpleCaseClosure(UChar32 c, const USetAdder *sa) {
pe=pe0;
UChar32 mapping;
GET_SLOT_VALUE(excWord, idx, pe, mapping);
addOneSimpleCaseClosure(c, mapping, sa);
sa->add(sa->set, mapping);
}
}
if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
@ -405,7 +368,7 @@ ucase_addSimpleCaseClosure(UChar32 c, const USetAdder *sa) {
int32_t delta;
GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
UChar32 mapping = (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
addOneSimpleCaseClosure(c, mapping, sa);
sa->add(sa->set, mapping);
}
/* get the closure string pointer & length */
@ -448,7 +411,7 @@ ucase_addSimpleCaseClosure(UChar32 c, const USetAdder *sa) {
for(int32_t idx=0; idx<closureLength;) {
UChar32 mapping;
U16_NEXT_UNSAFE(closure, idx, mapping);
addOneSimpleCaseClosure(c, mapping, sa);
sa->add(sa->set, mapping);
}
}
}

View file

@ -9,9 +9,9 @@
#ifdef INCLUDED_FROM_UCASE_CPP
static const UVersionInfo ucase_props_dataVersion={0xf,0,0,0};
static const UVersionInfo ucase_props_dataVersion={0xf,1,0,0};
static const int32_t ucase_props_indexes[UCASE_IX_TOP]={0x10,0x76f2,0x66c8,0x683,0x172,0,0,0,0,0,0,0,0,0,0,3};
static const int32_t ucase_props_indexes[UCASE_IX_TOP]={0x10,0x76ec,0x66c8,0x680,0x172,0,0,0,0,0,0,0,0,0,0,3};
static const uint16_t ucase_props_trieIndex[13148]={
0x355,0x35d,0x365,0x36d,0x37b,0x383,0x38b,0x393,0x39b,0x3a3,0x3aa,0x3b2,0x3ba,0x3c2,0x3ca,0x3d2,
@ -509,9 +509,9 @@ static const uint16_t ucase_props_trieIndex[13148]={
0x39b9,0x3a29,0x3a99,0x3b09,0x3b7b,0x3beb,0x3c5b,0x3ccb,0x3d3b,0x3dab,0x3e1b,0x3e8b,0x411,0x411,0x3ef9,0x3f79,
0x3fe9,0,0x4069,0x40e9,0xfc12,0xfc12,0xdb12,0xdb12,0x419b,4,0x4209,4,4,4,0x4259,0x42d9,
0x4349,0,0x43c9,0x4449,0xd512,0xd512,0xd512,0xd512,0x44fb,4,4,4,0x411,0x411,0x4569,0x4619,
0,0,0x46e9,0x4769,0xfc12,0xfc12,0xce12,0xce12,0,4,4,4,0x411,0x411,0x4819,0x48c9,
0x4999,0x391,0x4a19,0x4a99,0xfc12,0xfc12,0xc812,0xc812,0xfc92,4,4,4,0,0,0x4b49,0x4bc9,
0x4c39,0,0x4cb9,0x4d39,0xc012,0xc012,0xc112,0xc112,0x4deb,4,4,0,0,0,0,0,
0,0,0x46d9,0x4759,0xfc12,0xfc12,0xce12,0xce12,0,4,4,4,0x411,0x411,0x4809,0x48b9,
0x4979,0x391,0x49f9,0x4a79,0xfc12,0xfc12,0xc812,0xc812,0xfc92,4,4,4,0,0,0x4b29,0x4ba9,
0x4c19,0,0x4c99,0x4d19,0xc012,0xc012,0xc112,0xc112,0x4dcb,4,4,0,0,0,0,0,
0,0,0,0,0,0,0,4,4,4,4,4,0,0,0,0,
0,0,0,0,4,4,0,0,0,0,0,0,4,0,0,4,
0,0,4,4,4,4,4,0,0,0,0,0,0,0,0,0,
@ -525,8 +525,8 @@ static const uint16_t ucase_props_trieIndex[13148]={
0x64,0x44,0x64,0x64,0x64,0x64,0x64,0x64,0x44,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,2,
0,0,1,2,2,2,1,1,2,2,2,1,0,2,0,0,
0,2,2,2,2,2,0,0,0,0,0,0,2,0,0x4e5a,0,
2,0,0x4e9a,0x4eda,2,2,0,1,2,2,0xe12,2,1,0,0,0,
0,2,2,2,2,2,0,0,0,0,0,0,2,0,0x4e3a,0,
2,0,0x4e7a,0x4eba,2,2,0,1,2,2,0xe12,2,1,0,0,0,
0,1,0,0,1,1,2,2,0,0,0,0,0,2,1,1,
0x21,0x21,0,0,0,0,0xf211,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0x812,0x812,0x812,0x812,0x812,0x812,0x812,0x812,
@ -541,13 +541,13 @@ static const uint16_t ucase_props_trieIndex[13148]={
0x1812,0x1812,0x1812,0x1812,0x1812,0x1812,0x1812,0x1812,0x1812,0x1812,0x1812,0x1812,0x1812,0x1812,0x1812,0x1812,
0x1812,0x1812,0x1812,0x1812,0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,
0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,0xe811,
0xe811,0xe811,0xe811,0xe811,0x92,0xff91,0x4f1a,0x4f3a,0x4f5a,0x4f79,0x4f99,0x92,0xff91,0x92,0xff91,0x92,
0xff91,0x4fba,0x4fda,0x4ffa,0x501a,1,0x92,0xff91,1,0x92,0xff91,1,1,1,1,1,
0x25,5,0x503a,0x503a,0x92,0xff91,0x92,0xff91,1,0,0,0,0,0,0,0x92,
0xe811,0xe811,0xe811,0xe811,0x92,0xff91,0x4efa,0x4f1a,0x4f3a,0x4f59,0x4f79,0x92,0xff91,0x92,0xff91,0x92,
0xff91,0x4f9a,0x4fba,0x4fda,0x4ffa,1,0x92,0xff91,1,0x92,0xff91,1,1,1,1,1,
0x25,5,0x501a,0x501a,0x92,0xff91,0x92,0xff91,1,0,0,0,0,0,0,0x92,
0xff91,0x92,0xff91,0x44,0x44,0x44,0x92,0xff91,0,0,0,0,0,0,0,0,
0,0,0,0,0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,
0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,
0x5059,0x5059,0x5059,0x5059,0x5059,0x5059,0,0x5059,0,0,0,0,0,0x5059,0,0,
0,0,0,0,0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,
0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,
0x5039,0x5039,0x5039,0x5039,0x5039,0x5039,0,0x5039,0,0,0,0,0,0x5039,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0x64,0x44,0x44,0x44,0x44,0x44,0x44,0x44,0x44,0x44,0x44,0x44,0x44,
@ -562,7 +562,7 @@ static const uint16_t ucase_props_trieIndex[13148]={
0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,
0x92,0xff91,0x507a,0x50b9,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,
0x92,0xff91,0x505a,0x5099,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,
0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0,0x44,4,4,4,0,
0x44,0x44,0x44,0x44,0x44,0x44,0x44,0x44,0x44,0x44,0,4,0x92,0xff91,0x92,0xff91,
0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,
@ -573,11 +573,11 @@ static const uint16_t ucase_props_trieIndex[13148]={
4,4,4,4,4,4,4,4,4,4,4,4,4,4,0x92,0xff91,
0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,1,1,0x92,0xff91,
0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,
5,1,1,1,1,1,1,1,1,0x92,0xff91,0x92,0xff91,0x50fa,0x92,0xff91,
0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,4,4,4,0x92,0xff91,0x511a,1,0,
5,1,1,1,1,1,1,1,1,0x92,0xff91,0x92,0xff91,0x50da,0x92,0xff91,
0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,4,4,4,0x92,0xff91,0x50fa,1,0,
0x92,0xff91,0x92,0xff91,0x1811,1,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,
0x92,0xff91,0x513a,0x515a,0x517a,0x519a,0x513a,1,0x51ba,0x51da,0x51fa,0x521a,0x92,0xff91,0x92,0xff91,
0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0xe812,0x523a,0x525a,0x92,0xff91,0x92,0xff91,0,
0x92,0xff91,0x511a,0x513a,0x515a,0x517a,0x511a,1,0x519a,0x51ba,0x51da,0x51fa,0x92,0xff91,0x92,0xff91,
0x92,0xff91,0x92,0xff91,0x92,0xff91,0x92,0xff91,0xe812,0x521a,0x523a,0x92,0xff91,0x92,0xff91,0,
0,0,0,0,0x92,0xff91,0,1,0,1,0x92,0xff91,0x92,0xff91,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,5,5,5,0x92,0xff91,0,5,5,1,0,0,0,0,0,
@ -607,17 +607,17 @@ static const uint16_t ucase_props_trieIndex[13148]={
0,0,0,0,0,0,0,0,0,0,0,0,4,4,0,0,
0,0,0,4,4,0,0x64,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,0x5279,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,0x5259,1,1,1,1,
1,1,1,4,5,5,5,5,1,1,1,1,1,1,1,1,
1,5,4,4,0,0,0,0,0x5299,0x52c9,0x52f9,0x5329,0x5359,0x5389,0x53b9,0x53e9,
0x5419,0x5449,0x5479,0x54a9,0x54d9,0x5509,0x5539,0x5569,0x5b99,0x5bc9,0x5bf9,0x5c29,0x5c59,0x5c89,0x5cb9,0x5ce9,
0x5d19,0x5d49,0x5d79,0x5da9,0x5dd9,0x5e09,0x5e39,0x5e69,0x5e99,0x5ec9,0x5ef9,0x5f29,0x5f59,0x5f89,0x5fb9,0x5fe9,
0x6019,0x6049,0x6079,0x60a9,0x60d9,0x6109,0x6139,0x6169,0x5599,0x55c9,0x55f9,0x5629,0x5659,0x5689,0x56b9,0x56e9,
0x5719,0x5749,0x5779,0x57a9,0x57d9,0x5809,0x5839,0x5869,0x5899,0x58c9,0x58f9,0x5929,0x5959,0x5989,0x59b9,0x59e9,
0x5a19,0x5a49,0x5a79,0x5aa9,0x5ad9,0x5b09,0x5b39,0x5b69,0,0,0,0,0,4,0,0,
1,5,4,4,0,0,0,0,0x5279,0x52a9,0x52d9,0x5309,0x5339,0x5369,0x5399,0x53c9,
0x53f9,0x5429,0x5459,0x5489,0x54b9,0x54e9,0x5519,0x5549,0x5b79,0x5ba9,0x5bd9,0x5c09,0x5c39,0x5c69,0x5c99,0x5cc9,
0x5cf9,0x5d29,0x5d59,0x5d89,0x5db9,0x5de9,0x5e19,0x5e49,0x5e79,0x5ea9,0x5ed9,0x5f09,0x5f39,0x5f69,0x5f99,0x5fc9,
0x5ff9,0x6029,0x6059,0x6089,0x60b9,0x60e9,0x6119,0x6149,0x5579,0x55a9,0x55d9,0x5609,0x5639,0x5669,0x5699,0x56c9,
0x56f9,0x5729,0x5759,0x5789,0x57b9,0x57e9,0x5819,0x5849,0x5879,0x58a9,0x58d9,0x5909,0x5939,0x5969,0x5999,0x59c9,
0x59f9,0x5a29,0x5a59,0x5a89,0x5ab9,0x5ae9,0x5b19,0x5b49,0,0,0,0,0,4,0,0,
4,0,0,0,0,0x64,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0x6199,0x6219,0x6299,0x6319,0x63c9,0x6479,0x6519,0,
0,0,0,0,0,0,0,0,0,0,0,0x65b9,0x6639,0x66b9,0x6739,0x67b9,
0,0,0,0,0,0,0,0,0x6179,0x61f9,0x6279,0x62f9,0x63a9,0x6459,0x64e9,0,
0,0,0,0,0,0,0,0,0,0,0,0x6589,0x6609,0x6689,0x6709,0x6789,
0,0,0,0,0,0,0x64,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,4,4,4,4,4,4,
4,4,4,4,4,4,4,4,4,4,4,4,0,0,0,4,
@ -838,7 +838,7 @@ static const uint16_t ucase_props_trieIndex[13148]={
0,0,0,0,0,0,0,0,0,0,0,0
};
static const uint16_t ucase_props_exceptions[1667]={
static const uint16_t ucase_props_exceptions[1664]={
0xc850,0x20,2,0x130,0x131,0x4810,0x20,0x841,0x6b,1,0x212a,0x841,0x73,1,0x17f,0x5c50,
0x20,2,0x130,0x131,0x844,0x4b,1,0x212a,0x844,0x53,1,0x17f,0x806,0x3bc,0x39c,0x841,
0xe5,1,0x212b,0x8c0,1,0x2220,0x73,0x73,0x53,0x53,0x53,0x73,0x1e9e,0x844,0xc5,1,
@ -909,41 +909,40 @@ static const uint16_t ucase_props_exceptions[1667]={
0x3b7,0x3b9,0x397,0x399,0x880,0x2220,0x3ae,0x3b9,0x389,0x399,0x389,0x345,0x880,0x2220,0x3b7,0x342,
0x397,0x342,0x397,0x342,0x880,0x3330,0x3b7,0x342,0x3b9,0x397,0x342,0x399,0x397,0x342,0x345,0xc90,
9,0x220,0x3b7,0x3b9,0x397,0x399,0x880,0x3330,0x3b9,0x308,0x300,0x399,0x308,0x300,0x399,0x308,
0x300,0x8c0,1,0x3330,0x3b9,0x308,0x301,0x399,0x308,0x301,0x399,0x308,0x301,0x390,0x880,0x2220,
0x3b9,0x342,0x399,0x342,0x399,0x342,0x880,0x3330,0x3b9,0x308,0x342,0x399,0x308,0x342,0x399,0x308,
0x342,0x880,0x3330,0x3c5,0x308,0x300,0x3a5,0x308,0x300,0x3a5,0x308,0x300,0x8c0,1,0x3330,0x3c5,
0x308,0x301,0x3a5,0x308,0x301,0x3a5,0x308,0x301,0x3b0,0x880,0x2220,0x3c1,0x313,0x3a1,0x313,0x3a1,
0x313,0x880,0x2220,0x3c5,0x342,0x3a5,0x342,0x3a5,0x342,0x880,0x3330,0x3c5,0x308,0x342,0x3a5,0x308,
0x342,0x3a5,0x308,0x342,0x880,0x2220,0x1f7c,0x3b9,0x1ffa,0x399,0x1ffa,0x345,0x890,9,0x220,0x3c9,
0x3b9,0x3a9,0x399,0x880,0x2220,0x3ce,0x3b9,0x38f,0x399,0x38f,0x345,0x880,0x2220,0x3c9,0x342,0x3a9,
0x342,0x3a9,0x342,0x880,0x3330,0x3c9,0x342,0x3b9,0x3a9,0x342,0x399,0x3a9,0x342,0x345,0xc90,9,
0x220,0x3c9,0x3b9,0x3a9,0x399,0xc50,0x1d5d,1,0x3a9,0xc50,0x20bf,1,0x4b,0xc50,0x2046,1,
0xc5,0xc10,0x29f7,0xc10,0xee6,0xc10,0x29e7,0xc10,0x2a2b,0xc10,0x2a28,0xc10,0x2a1c,0xc10,0x29fd,0xc10,
0x2a1f,0xc10,0x2a1e,0xc10,0x2a3f,0xc10,0x1c60,0x841,0xa64b,1,0x1c88,0x844,0xa64a,1,0x1c88,0xc10,
0x8a04,0xc10,0xa528,0xc10,0xa544,0xc10,0xa54f,0xc10,0xa54b,0xc10,0xa541,0xc10,0xa512,0xc10,0xa52a,0xc10,
0xa515,0x810,0x3a0,0xc10,0xa543,0xc10,0x8a38,0xc10,0x3a0,0x806,0x13a0,0x13a0,0x806,0x13a1,0x13a1,0x806,
0x13a2,0x13a2,0x806,0x13a3,0x13a3,0x806,0x13a4,0x13a4,0x806,0x13a5,0x13a5,0x806,0x13a6,0x13a6,0x806,0x13a7,
0x13a7,0x806,0x13a8,0x13a8,0x806,0x13a9,0x13a9,0x806,0x13aa,0x13aa,0x806,0x13ab,0x13ab,0x806,0x13ac,0x13ac,
0x806,0x13ad,0x13ad,0x806,0x13ae,0x13ae,0x806,0x13af,0x13af,0x806,0x13b0,0x13b0,0x806,0x13b1,0x13b1,0x806,
0x13b2,0x13b2,0x806,0x13b3,0x13b3,0x806,0x13b4,0x13b4,0x806,0x13b5,0x13b5,0x806,0x13b6,0x13b6,0x806,0x13b7,
0x13b7,0x806,0x13b8,0x13b8,0x806,0x13b9,0x13b9,0x806,0x13ba,0x13ba,0x806,0x13bb,0x13bb,0x806,0x13bc,0x13bc,
0x806,0x13bd,0x13bd,0x806,0x13be,0x13be,0x806,0x13bf,0x13bf,0x806,0x13c0,0x13c0,0x806,0x13c1,0x13c1,0x806,
0x13c2,0x13c2,0x806,0x13c3,0x13c3,0x806,0x13c4,0x13c4,0x806,0x13c5,0x13c5,0x806,0x13c6,0x13c6,0x806,0x13c7,
0x13c7,0x806,0x13c8,0x13c8,0x806,0x13c9,0x13c9,0x806,0x13ca,0x13ca,0x806,0x13cb,0x13cb,0x806,0x13cc,0x13cc,
0x806,0x13cd,0x13cd,0x806,0x13ce,0x13ce,0x806,0x13cf,0x13cf,0x806,0x13d0,0x13d0,0x806,0x13d1,0x13d1,0x806,
0x13d2,0x13d2,0x806,0x13d3,0x13d3,0x806,0x13d4,0x13d4,0x806,0x13d5,0x13d5,0x806,0x13d6,0x13d6,0x806,0x13d7,
0x13d7,0x806,0x13d8,0x13d8,0x806,0x13d9,0x13d9,0x806,0x13da,0x13da,0x806,0x13db,0x13db,0x806,0x13dc,0x13dc,
0x806,0x13dd,0x13dd,0x806,0x13de,0x13de,0x806,0x13df,0x13df,0x806,0x13e0,0x13e0,0x806,0x13e1,0x13e1,0x806,
0x13e2,0x13e2,0x806,0x13e3,0x13e3,0x806,0x13e4,0x13e4,0x806,0x13e5,0x13e5,0x806,0x13e6,0x13e6,0x806,0x13e7,
0x13e7,0x806,0x13e8,0x13e8,0x806,0x13e9,0x13e9,0x806,0x13ea,0x13ea,0x806,0x13eb,0x13eb,0x806,0x13ec,0x13ec,
0x806,0x13ed,0x13ed,0x806,0x13ee,0x13ee,0x806,0x13ef,0x13ef,0x880,0x2220,0x66,0x66,0x46,0x46,0x46,
0x66,0x880,0x2220,0x66,0x69,0x46,0x49,0x46,0x69,0x880,0x2220,0x66,0x6c,0x46,0x4c,0x46,
0x6c,0x880,0x3330,0x66,0x66,0x69,0x46,0x46,0x49,0x46,0x66,0x69,0x880,0x3330,0x66,0x66,
0x6c,0x46,0x46,0x4c,0x46,0x66,0x6c,0x8c0,1,0x2220,0x73,0x74,0x53,0x54,0x53,0x74,
0xfb06,0x8c0,1,0x2220,0x73,0x74,0x53,0x54,0x53,0x74,0xfb05,0x880,0x2220,0x574,0x576,0x544,
0x546,0x544,0x576,0x880,0x2220,0x574,0x565,0x544,0x535,0x544,0x565,0x880,0x2220,0x574,0x56b,0x544,
0x53b,0x544,0x56b,0x880,0x2220,0x57e,0x576,0x54e,0x546,0x54e,0x576,0x880,0x2220,0x574,0x56d,0x544,
0x53d,0x544,0x56d
0x300,0x882,0x390,0x3330,0x3b9,0x308,0x301,0x399,0x308,0x301,0x399,0x308,0x301,0x880,0x2220,0x3b9,
0x342,0x399,0x342,0x399,0x342,0x880,0x3330,0x3b9,0x308,0x342,0x399,0x308,0x342,0x399,0x308,0x342,
0x880,0x3330,0x3c5,0x308,0x300,0x3a5,0x308,0x300,0x3a5,0x308,0x300,0x882,0x3b0,0x3330,0x3c5,0x308,
0x301,0x3a5,0x308,0x301,0x3a5,0x308,0x301,0x880,0x2220,0x3c1,0x313,0x3a1,0x313,0x3a1,0x313,0x880,
0x2220,0x3c5,0x342,0x3a5,0x342,0x3a5,0x342,0x880,0x3330,0x3c5,0x308,0x342,0x3a5,0x308,0x342,0x3a5,
0x308,0x342,0x880,0x2220,0x1f7c,0x3b9,0x1ffa,0x399,0x1ffa,0x345,0x890,9,0x220,0x3c9,0x3b9,0x3a9,
0x399,0x880,0x2220,0x3ce,0x3b9,0x38f,0x399,0x38f,0x345,0x880,0x2220,0x3c9,0x342,0x3a9,0x342,0x3a9,
0x342,0x880,0x3330,0x3c9,0x342,0x3b9,0x3a9,0x342,0x399,0x3a9,0x342,0x345,0xc90,9,0x220,0x3c9,
0x3b9,0x3a9,0x399,0xc50,0x1d5d,1,0x3a9,0xc50,0x20bf,1,0x4b,0xc50,0x2046,1,0xc5,0xc10,
0x29f7,0xc10,0xee6,0xc10,0x29e7,0xc10,0x2a2b,0xc10,0x2a28,0xc10,0x2a1c,0xc10,0x29fd,0xc10,0x2a1f,0xc10,
0x2a1e,0xc10,0x2a3f,0xc10,0x1c60,0x841,0xa64b,1,0x1c88,0x844,0xa64a,1,0x1c88,0xc10,0x8a04,0xc10,
0xa528,0xc10,0xa544,0xc10,0xa54f,0xc10,0xa54b,0xc10,0xa541,0xc10,0xa512,0xc10,0xa52a,0xc10,0xa515,0x810,
0x3a0,0xc10,0xa543,0xc10,0x8a38,0xc10,0x3a0,0x806,0x13a0,0x13a0,0x806,0x13a1,0x13a1,0x806,0x13a2,0x13a2,
0x806,0x13a3,0x13a3,0x806,0x13a4,0x13a4,0x806,0x13a5,0x13a5,0x806,0x13a6,0x13a6,0x806,0x13a7,0x13a7,0x806,
0x13a8,0x13a8,0x806,0x13a9,0x13a9,0x806,0x13aa,0x13aa,0x806,0x13ab,0x13ab,0x806,0x13ac,0x13ac,0x806,0x13ad,
0x13ad,0x806,0x13ae,0x13ae,0x806,0x13af,0x13af,0x806,0x13b0,0x13b0,0x806,0x13b1,0x13b1,0x806,0x13b2,0x13b2,
0x806,0x13b3,0x13b3,0x806,0x13b4,0x13b4,0x806,0x13b5,0x13b5,0x806,0x13b6,0x13b6,0x806,0x13b7,0x13b7,0x806,
0x13b8,0x13b8,0x806,0x13b9,0x13b9,0x806,0x13ba,0x13ba,0x806,0x13bb,0x13bb,0x806,0x13bc,0x13bc,0x806,0x13bd,
0x13bd,0x806,0x13be,0x13be,0x806,0x13bf,0x13bf,0x806,0x13c0,0x13c0,0x806,0x13c1,0x13c1,0x806,0x13c2,0x13c2,
0x806,0x13c3,0x13c3,0x806,0x13c4,0x13c4,0x806,0x13c5,0x13c5,0x806,0x13c6,0x13c6,0x806,0x13c7,0x13c7,0x806,
0x13c8,0x13c8,0x806,0x13c9,0x13c9,0x806,0x13ca,0x13ca,0x806,0x13cb,0x13cb,0x806,0x13cc,0x13cc,0x806,0x13cd,
0x13cd,0x806,0x13ce,0x13ce,0x806,0x13cf,0x13cf,0x806,0x13d0,0x13d0,0x806,0x13d1,0x13d1,0x806,0x13d2,0x13d2,
0x806,0x13d3,0x13d3,0x806,0x13d4,0x13d4,0x806,0x13d5,0x13d5,0x806,0x13d6,0x13d6,0x806,0x13d7,0x13d7,0x806,
0x13d8,0x13d8,0x806,0x13d9,0x13d9,0x806,0x13da,0x13da,0x806,0x13db,0x13db,0x806,0x13dc,0x13dc,0x806,0x13dd,
0x13dd,0x806,0x13de,0x13de,0x806,0x13df,0x13df,0x806,0x13e0,0x13e0,0x806,0x13e1,0x13e1,0x806,0x13e2,0x13e2,
0x806,0x13e3,0x13e3,0x806,0x13e4,0x13e4,0x806,0x13e5,0x13e5,0x806,0x13e6,0x13e6,0x806,0x13e7,0x13e7,0x806,
0x13e8,0x13e8,0x806,0x13e9,0x13e9,0x806,0x13ea,0x13ea,0x806,0x13eb,0x13eb,0x806,0x13ec,0x13ec,0x806,0x13ed,
0x13ed,0x806,0x13ee,0x13ee,0x806,0x13ef,0x13ef,0x880,0x2220,0x66,0x66,0x46,0x46,0x46,0x66,0x880,
0x2220,0x66,0x69,0x46,0x49,0x46,0x69,0x880,0x2220,0x66,0x6c,0x46,0x4c,0x46,0x6c,0x880,
0x3330,0x66,0x66,0x69,0x46,0x46,0x49,0x46,0x66,0x69,0x880,0x3330,0x66,0x66,0x6c,0x46,
0x46,0x4c,0x46,0x66,0x6c,0x882,0xfb06,0x2220,0x73,0x74,0x53,0x54,0x53,0x74,0x8c0,1,
0x2220,0x73,0x74,0x53,0x54,0x53,0x74,0xfb05,0x880,0x2220,0x574,0x576,0x544,0x546,0x544,0x576,
0x880,0x2220,0x574,0x565,0x544,0x535,0x544,0x565,0x880,0x2220,0x574,0x56b,0x544,0x53b,0x544,0x56b,
0x880,0x2220,0x57e,0x576,0x54e,0x546,0x54e,0x576,0x880,0x2220,0x574,0x56d,0x544,0x53d,0x544,0x56d
};
static const uint16_t ucase_props_unfold[370]={

View file

@ -679,14 +679,18 @@ void toUpper(uint32_t options,
// Adding one only to the final vowel in a longer sequence
// (which does not occur in normal writing) would require lookahead.
// Set the same flag as for preserving an existing dialytika.
if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
(upper == 0x399 || upper == 0x3A5)) {
data |= HAS_DIALYTIKA;
if ((data & HAS_VOWEL) != 0 &&
(state & (AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT | AFTER_VOWEL_WITH_COMBINING_ACCENT)) !=
0 &&
(upper == 0x399 || upper == 0x3A5)) {
data |= (state & AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT) != 0 ? HAS_DIALYTIKA
: HAS_COMBINING_DIALYTIKA;
}
int32_t numYpogegrammeni = 0; // Map each one to a trailing, spacing, capital iota.
if ((data & HAS_YPOGEGRAMMENI) != 0) {
numYpogegrammeni = 1;
}
const UBool hasPrecomposedAccent = (data & HAS_ACCENT) != 0;
// Skip combining diacritics after this Greek letter.
int32_t nextNextIndex = nextIndex;
while (nextIndex < srcLength) {
@ -704,7 +708,8 @@ void toUpper(uint32_t options,
}
}
if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
nextState |= AFTER_VOWEL_WITH_ACCENT;
nextState |= hasPrecomposedAccent ? AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT
: AFTER_VOWEL_WITH_COMBINING_ACCENT;
}
// Map according to Greek rules.
UBool addTonos = false;
@ -715,7 +720,7 @@ void toUpper(uint32_t options,
!isFollowedByCasedLetter(src, nextIndex, srcLength)) {
// Keep disjunctive "or" with (only) a tonos.
// We use the same "word boundary" conditions as for the Final_Sigma test.
if (i == nextIndex) {
if (hasPrecomposedAccent) {
upper = 0x389; // Preserve the precomposed form.
} else {
addTonos = true;

View file

@ -263,7 +263,8 @@ static const uint32_t HAS_EITHER_DIALYTIKA = HAS_DIALYTIKA | HAS_COMBINING_DIALY
// State bits.
static const uint32_t AFTER_CASED = 1;
static const uint32_t AFTER_VOWEL_WITH_ACCENT = 2;
static const uint32_t AFTER_VOWEL_WITH_COMBINING_ACCENT = 2;
static const uint32_t AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT = 4;
uint32_t getLetterData(UChar32 c);

File diff suppressed because it is too large Load diff

View file

@ -11,6 +11,8 @@
#if !UCONFIG_NO_FORMATTING
#include <utility>
#include "unicode/ucurr.h"
#include "unicode/locid.h"
#include "unicode/ures.h"
@ -20,6 +22,7 @@
#include "unicode/usetiter.h"
#include "unicode/utf16.h"
#include "ustr_imp.h"
#include "bytesinkutil.h"
#include "charstr.h"
#include "cmemory.h"
#include "cstring.h"
@ -520,14 +523,18 @@ ucurr_forLocale(const char* locale,
return 0;
}
char currency[4]; // ISO currency codes are alpha3 codes.
UErrorCode localStatus = U_ZERO_ERROR;
int32_t resLen = uloc_getKeywordValue(locale, "currency",
currency, UPRV_LENGTHOF(currency), &localStatus);
if (U_SUCCESS(localStatus) && resLen == 3 && uprv_isInvariantString(currency, resLen)) {
CharString currency;
{
CharStringByteSink sink(&currency);
ulocimp_getKeywordValue(locale, "currency", sink, &localStatus);
}
int32_t resLen = currency.length();
if (U_SUCCESS(localStatus) && resLen == 3 && uprv_isInvariantString(currency.data(), resLen)) {
if (resLen < buffCapacity) {
T_CString_toUpperCase(currency);
u_charsToUChars(currency, buff, resLen);
T_CString_toUpperCase(currency.data());
u_charsToUChars(currency.data(), buff, resLen);
}
return u_terminateUChars(buff, buffCapacity, resLen, ec);
}
@ -597,11 +604,15 @@ ucurr_forLocale(const char* locale,
if ((U_FAILURE(localStatus)) && strchr(id, '_') != 0) {
// We don't know about it. Check to see if we support the variant.
uloc_getParent(locale, id, UPRV_LENGTHOF(id), ec);
CharString parent;
{
CharStringByteSink sink(&parent);
ulocimp_getParent(locale, sink, ec);
}
*ec = U_USING_FALLBACK_WARNING;
// TODO: Loop over the shortened id rather than recursing and
// TODO: Loop over the parent rather than recursing and
// looking again for a currency keyword.
return ucurr_forLocale(id, buff, buffCapacity, ec);
return ucurr_forLocale(parent.data(), buff, buffCapacity, ec);
}
if (*ec == U_ZERO_ERROR || localStatus != U_ZERO_ERROR) {
// There is nothing to fallback to. Report the failure/warning if possible.
@ -624,20 +635,22 @@ ucurr_forLocale(const char* locale,
* @return true if the fallback happened; false if locale is already
* root ("").
*/
static UBool fallback(char *loc) {
if (!*loc) {
static UBool fallback(CharString& loc) {
if (loc.isEmpty()) {
return false;
}
UErrorCode status = U_ZERO_ERROR;
if (uprv_strcmp(loc, "en_GB") == 0) {
if (loc == "en_GB") {
// HACK: See #13368. We need "en_GB" to fall back to "en_001" instead of "en"
// in order to consume the correct data strings. This hack will be removed
// when proper data sink loading is implemented here.
// NOTE: "001" adds 1 char over "GB". However, both call sites allocate
// arrays with length ULOC_FULLNAME_CAPACITY (plenty of room for en_001).
uprv_strcpy(loc + 3, "001");
loc.truncate(3);
loc.append("001", status);
} else {
uloc_getParent(loc, loc, (int32_t)uprv_strlen(loc), &status);
CharString tmp;
CharStringByteSink sink(&tmp);
ulocimp_getParent(loc.data(), sink, &status);
loc = std::move(tmp);
}
/*
char *i = uprv_strrchr(loc, '_');
@ -692,9 +705,12 @@ ucurr_getName(const char16_t* currency,
// this function.
UErrorCode ec2 = U_ZERO_ERROR;
char loc[ULOC_FULLNAME_CAPACITY];
uloc_getName(locale, loc, sizeof(loc), &ec2);
if (U_FAILURE(ec2) || ec2 == U_STRING_NOT_TERMINATED_WARNING) {
CharString loc;
{
CharStringByteSink sink(&loc);
ulocimp_getName(locale, sink, &ec2);
}
if (U_FAILURE(ec2)) {
*ec = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
@ -707,7 +723,7 @@ ucurr_getName(const char16_t* currency,
const char16_t* s = nullptr;
ec2 = U_ZERO_ERROR;
LocalUResourceBundlePointer rb(ures_open(U_ICUDATA_CURR, loc, &ec2));
LocalUResourceBundlePointer rb(ures_open(U_ICUDATA_CURR, loc.data(), &ec2));
if (nameStyle == UCURR_NARROW_SYMBOL_NAME || nameStyle == UCURR_FORMAL_SYMBOL_NAME || nameStyle == UCURR_VARIANT_SYMBOL_NAME) {
CharString key;
@ -791,9 +807,12 @@ ucurr_getPluralName(const char16_t* currency,
// this function.
UErrorCode ec2 = U_ZERO_ERROR;
char loc[ULOC_FULLNAME_CAPACITY];
uloc_getName(locale, loc, sizeof(loc), &ec2);
if (U_FAILURE(ec2) || ec2 == U_STRING_NOT_TERMINATED_WARNING) {
CharString loc;
{
CharStringByteSink sink(&loc);
ulocimp_getName(locale, sink, &ec2);
}
if (U_FAILURE(ec2)) {
*ec = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
@ -803,7 +822,7 @@ ucurr_getPluralName(const char16_t* currency,
const char16_t* s = nullptr;
ec2 = U_ZERO_ERROR;
UResourceBundle* rb = ures_open(U_ICUDATA_CURR, loc, &ec2);
UResourceBundle* rb = ures_open(U_ICUDATA_CURR, loc.data(), &ec2);
rb = ures_getByKey(rb, CURRENCYPLURALS, rb, &ec2);
@ -904,13 +923,17 @@ getCurrencyNameCount(const char* loc, int32_t* total_currency_name_count, int32_
*total_currency_name_count = 0;
*total_currency_symbol_count = 0;
const char16_t* s = nullptr;
char locale[ULOC_FULLNAME_CAPACITY] = "";
uprv_strcpy(locale, loc);
CharString locale;
{
UErrorCode status = U_ZERO_ERROR;
locale.append(loc, status);
if (U_FAILURE(status)) { return; }
}
const icu::Hashtable *currencySymbolsEquiv = getCurrSymbolsEquiv();
for (;;) {
UErrorCode ec2 = U_ZERO_ERROR;
// TODO: ures_openDirect?
UResourceBundle* rb = ures_open(U_ICUDATA_CURR, locale, &ec2);
UResourceBundle* rb = ures_open(U_ICUDATA_CURR, locale.data(), &ec2);
UResourceBundle* curr = ures_getByKey(rb, CURRENCIES, nullptr, &ec2);
int32_t n = ures_getSize(curr);
for (int32_t i=0; i<n; ++i) {
@ -979,14 +1002,17 @@ collectCurrencyNames(const char* locale,
// Look up the Currencies resource for the given locale.
UErrorCode ec2 = U_ZERO_ERROR;
char loc[ULOC_FULLNAME_CAPACITY] = "";
uloc_getName(locale, loc, sizeof(loc), &ec2);
if (U_FAILURE(ec2) || ec2 == U_STRING_NOT_TERMINATED_WARNING) {
CharString loc;
{
CharStringByteSink sink(&loc);
ulocimp_getName(locale, sink, &ec2);
}
if (U_FAILURE(ec2)) {
ec = U_ILLEGAL_ARGUMENT_ERROR;
}
// Get maximum currency name count first.
getCurrencyNameCount(loc, total_currency_name_count, total_currency_symbol_count);
getCurrencyNameCount(loc.data(), total_currency_name_count, total_currency_symbol_count);
*currencyNames = (CurrencyNameStruct*)uprv_malloc
(sizeof(CurrencyNameStruct) * (*total_currency_name_count));
@ -1014,7 +1040,7 @@ collectCurrencyNames(const char* locale,
for (int32_t localeLevel = 0; ; ++localeLevel) {
ec2 = U_ZERO_ERROR;
// TODO: ures_openDirect
UResourceBundle* rb = ures_open(U_ICUDATA_CURR, loc, &ec2);
UResourceBundle* rb = ures_open(U_ICUDATA_CURR, loc.data(), &ec2);
UResourceBundle* curr = ures_getByKey(rb, CURRENCIES, nullptr, &ec2);
int32_t n = ures_getSize(curr);
for (int32_t i=0; i<n; ++i) {

View file

@ -1196,7 +1196,7 @@ doOpenChoice(const char *path, const char *type, const char *name,
*p = U_FILE_SEP_CHAR;
}
#if defined (UDATA_DEBUG)
fprintf(stderr, "Changed path from [%s] to [%s]\n", path, altSepPath.s);
fprintf(stderr, "Changed path from [%s] to [%s]\n", path, altSepPath.data());
#endif
path = altSepPath.data();
}

View file

@ -103,12 +103,12 @@ static const char * const LANGUAGES[] = {
"ba", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
"be", "bej", "bem", "bew", "bez", "bfd", "bfq", "bg",
"bgc", "bgn", "bho", "bi", "bik", "bin", "bjn", "bkm", "bla",
"bm", "bn", "bo", "bpy", "bqi", "br", "bra", "brh",
"blo", "bm", "bn", "bo", "bpy", "bqi", "br", "bra", "brh",
"brx", "bs", "bss", "bua", "bug", "bum", "byn", "byv",
"ca", "cad", "car", "cay", "cch", "ccp", "ce", "ceb", "cgg",
"ch", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
"chr", "chy", "ckb", "co", "cop", "cps", "cr", "crh",
"cs", "csb", "cu", "cv", "cy",
"cs", "csb", "csw", "cu", "cv", "cy",
"da", "dak", "dar", "dav", "de", "del", "den", "dgr",
"din", "dje", "doi", "dsb", "dtp", "dua", "dum", "dv",
"dyo", "dyu", "dz", "dzg",
@ -135,7 +135,7 @@ static const char * const LANGUAGES[] = {
"kkj", "kl", "kln", "km", "kmb", "kn", "ko", "koi",
"kok", "kos", "kpe", "kr", "krc", "kri", "krj", "krl",
"kru", "ks", "ksb", "ksf", "ksh", "ku", "kum", "kut",
"kv", "kw", "ky",
"kv", "kw", "kxv", "ky",
"la", "lad", "lag", "lah", "lam", "lb", "lez", "lfn",
"lg", "li", "lij", "liv", "lkt", "lmo", "ln", "lo",
"lol", "loz", "lrc", "lt", "ltg", "lu", "lua", "lui",
@ -169,14 +169,14 @@ static const char * const LANGUAGES[] = {
"sv", "sw", "swb", "syc", "syr", "szl",
"ta", "tcy", "te", "tem", "teo", "ter", "tet", "tg",
"th", "ti", "tig", "tiv", "tk", "tkl", "tkr",
"tlh", "tli", "tly", "tmh", "tn", "to", "tog", "tpi",
"tlh", "tli", "tly", "tmh", "tn", "to", "tog", "tok", "tpi",
"tr", "tru", "trv", "ts", "tsd", "tsi", "tt", "ttt",
"tum", "tvl", "tw", "twq", "ty", "tyv", "tzm",
"udm", "ug", "uga", "uk", "umb", "und", "ur", "uz",
"vai", "ve", "vec", "vep", "vi", "vls", "vmf", "vo",
"vot", "vro", "vun",
"vai", "ve", "vec", "vep", "vi", "vls", "vmf", "vmw",
"vo", "vot", "vro", "vun",
"wa", "wae", "wal", "war", "was", "wbp", "wo", "wuu",
"xal", "xh", "xmf", "xog",
"xal", "xh", "xmf", "xnr", "xog",
"yao", "yap", "yav", "ybb", "yi", "yo", "yrl", "yue",
"za", "zap", "zbl", "zea", "zen", "zgh", "zh", "zu",
"zun", "zxx", "zza",
@ -220,12 +220,12 @@ static const char * const LANGUAGES_3[] = {
"bak", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
"bel", "bej", "bem", "bew", "bez", "bfd", "bfq", "bul",
"bgc", "bgn", "bho", "bis", "bik", "bin", "bjn", "bkm", "bla",
"bam", "ben", "bod", "bpy", "bqi", "bre", "bra", "brh",
"blo", "bam", "ben", "bod", "bpy", "bqi", "bre", "bra", "brh",
"brx", "bos", "bss", "bua", "bug", "bum", "byn", "byv",
"cat", "cad", "car", "cay", "cch", "ccp", "che", "ceb", "cgg",
"cha", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
"chr", "chy", "ckb", "cos", "cop", "cps", "cre", "crh",
"ces", "csb", "chu", "chv", "cym",
"ces", "csb", "csw", "chu", "chv", "cym",
"dan", "dak", "dar", "dav", "deu", "del", "den", "dgr",
"din", "dje", "doi", "dsb", "dtp", "dua", "dum", "div",
"dyo", "dyu", "dzo", "dzg",
@ -252,7 +252,7 @@ static const char * const LANGUAGES_3[] = {
"kkj", "kal", "kln", "khm", "kmb", "kan", "kor", "koi",
"kok", "kos", "kpe", "kau", "krc", "kri", "krj", "krl",
"kru", "kas", "ksb", "ksf", "ksh", "kur", "kum", "kut",
"kom", "cor", "kir",
"kom", "cor", "kxv", "kir",
"lat", "lad", "lag", "lah", "lam", "ltz", "lez", "lfn",
"lug", "lim", "lij", "liv", "lkt", "lmo", "lin", "lao",
"lol", "loz", "lrc", "lit", "ltg", "lub", "lua", "lui",
@ -286,14 +286,14 @@ static const char * const LANGUAGES_3[] = {
"swe", "swa", "swb", "syc", "syr", "szl",
"tam", "tcy", "tel", "tem", "teo", "ter", "tet", "tgk",
"tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr",
"tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tpi",
"tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tok", "tpi",
"tur", "tru", "trv", "tso", "tsd", "tsi", "tat", "ttt",
"tum", "tvl", "twi", "twq", "tah", "tyv", "tzm",
"udm", "uig", "uga", "ukr", "umb", "und", "urd", "uzb",
"vai", "ven", "vec", "vep", "vie", "vls", "vmf", "vol",
"vot", "vro", "vun",
"vai", "ven", "vec", "vep", "vie", "vls", "vmf", "vmw",
"vol", "vot", "vro", "vun",
"wln", "wae", "wal", "war", "was", "wbp", "wol", "wuu",
"xal", "xho", "xmf", "xog",
"xal", "xho", "xmf", "xnr", "xog",
"yao", "yap", "yav", "ybb", "yid", "yor", "yrl", "yue",
"zha", "zap", "zbl", "zea", "zen", "zgh", "zho", "zul",
"zun", "zxx", "zza",
@ -477,25 +477,6 @@ static const CanonicalizationMap CANONICALIZE_MAP[] = {
/* ### BCP47 Conversion *******************************************/
/* Test if the locale id has BCP47 u extension and does not have '@' */
#define _hasBCP47Extension(id) (id && uprv_strstr(id, "@") == nullptr && getShortestSubtagLength(localeID) == 1)
/* Converts the BCP47 id to Unicode id. Does nothing to id if conversion fails */
static const char* _ConvertBCP47(
const char* id, char* buffer, int32_t length,
UErrorCode* err, int32_t* pLocaleIdSize) {
const char* finalID;
int32_t localeIDSize = uloc_forLanguageTag(id, buffer, length, nullptr, err);
if (localeIDSize <= 0 || U_FAILURE(*err) || *err == U_STRING_NOT_TERMINATED_WARNING) {
finalID=id;
if (*err == U_STRING_NOT_TERMINATED_WARNING) {
*err = U_BUFFER_OVERFLOW_ERROR;
}
} else {
finalID=buffer;
}
if (pLocaleIdSize != nullptr) {
*pLocaleIdSize = localeIDSize;
}
return finalID;
}
/* Gets the size of the shortest subtag in the given localeID. */
static int32_t getShortestSubtagLength(const char *localeID) {
int32_t localeIDLength = static_cast<int32_t>(uprv_strlen(localeID));
@ -762,7 +743,7 @@ ulocimp_getKeywordValue(const char* localeID,
char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
if(status && U_SUCCESS(*status) && localeID) {
char tempBuffer[ULOC_FULLNAME_CAPACITY];
CharString tempBuffer;
const char* tmpLocaleID;
if (keywordName == nullptr || keywordName[0] == 0) {
@ -776,8 +757,9 @@ ulocimp_getKeywordValue(const char* localeID,
}
if (_hasBCP47Extension(localeID)) {
tmpLocaleID = _ConvertBCP47(localeID, tempBuffer,
sizeof(tempBuffer), status, nullptr);
CharStringByteSink sink(&tempBuffer);
ulocimp_forLanguageTag(localeID, -1, sink, nullptr, status);
tmpLocaleID = U_SUCCESS(*status) && !tempBuffer.isEmpty() ? tempBuffer.data() : localeID;
} else {
tmpLocaleID=localeID;
}
@ -1406,7 +1388,7 @@ U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywords(const char* localeID,
UErrorCode* status)
{
char tempBuffer[ULOC_FULLNAME_CAPACITY];
CharString tempBuffer;
const char* tmpLocaleID;
if(status==nullptr || U_FAILURE(*status)) {
@ -1414,8 +1396,9 @@ uloc_openKeywords(const char* localeID,
}
if (_hasBCP47Extension(localeID)) {
tmpLocaleID = _ConvertBCP47(localeID, tempBuffer,
sizeof(tempBuffer), status, nullptr);
CharStringByteSink sink(&tempBuffer);
ulocimp_forLanguageTag(localeID, -1, sink, nullptr, status);
tmpLocaleID = U_SUCCESS(*status) && !tempBuffer.isEmpty() ? tempBuffer.data() : localeID;
} else {
if (localeID==nullptr) {
localeID=uloc_getDefault();
@ -1489,7 +1472,7 @@ _canonicalize(const char* localeID,
}
int32_t j, fieldCount=0, scriptSize=0, variantSize=0;
PreflightingLocaleIDBuffer tempBuffer; // if localeID has a BCP47 extension, tmpLocaleID points to this
CharString tempBuffer; // if localeID has a BCP47 extension, tmpLocaleID points to this
CharString localeIDWithHyphens; // if localeID has a BPC47 extension and have _, tmpLocaleID points to this
const char* origLocaleID;
const char* tmpLocaleID;
@ -1512,13 +1495,9 @@ _canonicalize(const char* localeID,
}
}
do {
// After this call tmpLocaleID may point to localeIDPtr which may
// point to either localeID or localeIDWithHyphens.data().
tmpLocaleID = _ConvertBCP47(localeIDPtr, tempBuffer.getBuffer(),
tempBuffer.getCapacity(), err,
&(tempBuffer.requestedCapacity));
} while (tempBuffer.needToTryAgain(err));
CharStringByteSink tempSink(&tempBuffer);
ulocimp_forLanguageTag(localeIDPtr, -1, tempSink, nullptr, err);
tmpLocaleID = U_SUCCESS(*err) && !tempBuffer.isEmpty() ? tempBuffer.data() : localeIDPtr;
} else {
if (localeID==nullptr) {
localeID=uloc_getDefault();
@ -1676,12 +1655,39 @@ uloc_getParent(const char* localeID,
char* parent,
int32_t parentCapacity,
UErrorCode* err)
{
if (U_FAILURE(*err)) {
return 0;
}
CheckedArrayByteSink sink(parent, parentCapacity);
ulocimp_getParent(localeID, sink, err);
int32_t reslen = sink.NumberOfBytesAppended();
if (U_FAILURE(*err)) {
return reslen;
}
if (sink.Overflowed()) {
*err = U_BUFFER_OVERFLOW_ERROR;
} else {
u_terminateChars(parent, parentCapacity, reslen, err);
}
return reslen;
}
U_CAPI void U_EXPORT2
ulocimp_getParent(const char* localeID,
icu::ByteSink& sink,
UErrorCode* err)
{
const char *lastUnderscore;
int32_t i;
if (U_FAILURE(*err))
return 0;
return;
if (localeID == nullptr)
localeID = uloc_getDefault();
@ -1697,13 +1703,9 @@ uloc_getParent(const char* localeID,
if (uprv_strnicmp(localeID, "und_", 4) == 0) {
localeID += 3;
i -= 3;
uprv_memmove(parent, localeID, uprv_min(i, parentCapacity));
} else if (parent != localeID) {
uprv_memcpy(parent, localeID, uprv_min(i, parentCapacity));
}
sink.Append(localeID, i);
}
return u_terminateChars(parent, parentCapacity, i, err);
}
U_CAPI int32_t U_EXPORT2
@ -1795,7 +1797,7 @@ uloc_getVariant(const char* localeID,
int32_t variantCapacity,
UErrorCode* err)
{
char tempBuffer[ULOC_FULLNAME_CAPACITY];
CharString tempBuffer;
const char* tmpLocaleID;
int32_t i=0;
@ -1804,7 +1806,9 @@ uloc_getVariant(const char* localeID,
}
if (_hasBCP47Extension(localeID)) {
tmpLocaleID =_ConvertBCP47(localeID, tempBuffer, sizeof(tempBuffer), err, nullptr);
CharStringByteSink sink(&tempBuffer);
ulocimp_forLanguageTag(localeID, -1, sink, nullptr, err);
tmpLocaleID = U_SUCCESS(*err) && !tempBuffer.isEmpty() ? tempBuffer.data() : localeID;
} else {
if (localeID==nullptr) {
localeID=uloc_getDefault();

View file

@ -1326,14 +1326,23 @@ _appendKeywordsToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool st
attrBufLength = 0;
for (; i < len; i++) {
if (buf[i] != '-') {
attrBuf[attrBufLength++] = buf[i];
if (static_cast<size_t>(attrBufLength) < sizeof(attrBuf)) {
attrBuf[attrBufLength++] = buf[i];
} else {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
} else {
i++;
break;
}
}
if (attrBufLength > 0) {
attrBuf[attrBufLength] = 0;
if (static_cast<size_t>(attrBufLength) < sizeof(attrBuf)) {
attrBuf[attrBufLength] = 0;
} else {
*status = U_STRING_NOT_TERMINATED_WARNING;
}
} else if (i >= len){
break;
@ -1879,11 +1888,8 @@ static void
_appendPrivateuseToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UBool hadPosix, UErrorCode* status) {
(void)hadPosix;
char buf[ULOC_FULLNAME_CAPACITY];
char tmpAppend[ULOC_FULLNAME_CAPACITY];
UErrorCode tmpStatus = U_ZERO_ERROR;
int32_t len, i;
int32_t reslen = 0;
int32_t capacity = sizeof tmpAppend;
if (U_FAILURE(*status)) {
return;
@ -1936,37 +1942,18 @@ _appendPrivateuseToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool
}
if (writeValue) {
if (reslen < capacity) {
tmpAppend[reslen++] = SEP;
}
sink.Append("-", 1);
if (firstValue) {
if (reslen < capacity) {
tmpAppend[reslen++] = *PRIVATEUSE_KEY;
}
if (reslen < capacity) {
tmpAppend[reslen++] = SEP;
}
len = (int32_t)uprv_strlen(PRIVUSE_VARIANT_PREFIX);
if (reslen < capacity) {
uprv_memcpy(tmpAppend + reslen, PRIVUSE_VARIANT_PREFIX, uprv_min(len, capacity - reslen));
}
reslen += len;
if (reslen < capacity) {
tmpAppend[reslen++] = SEP;
}
sink.Append(PRIVATEUSE_KEY, UPRV_LENGTHOF(PRIVATEUSE_KEY) - 1);
sink.Append("-", 1);
sink.Append(PRIVUSE_VARIANT_PREFIX, UPRV_LENGTHOF(PRIVUSE_VARIANT_PREFIX) - 1);
sink.Append("-", 1);
firstValue = false;
}
len = (int32_t)uprv_strlen(pPriv);
if (reslen < capacity) {
uprv_memcpy(tmpAppend + reslen, pPriv, uprv_min(len, capacity - reslen));
}
reslen += len;
sink.Append(pPriv, len);
}
}
/* reset private use starting position */
@ -1976,15 +1963,6 @@ _appendPrivateuseToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool
}
p++;
}
if (U_FAILURE(*status)) {
return;
}
}
if (U_SUCCESS(*status)) {
len = reslen;
sink.Append(tmpAppend, len);
}
}
@ -2092,12 +2070,13 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta
int32_t oldTagLength = tagLen;
if (tagLen < newTagLength) {
uprv_free(tagBuf);
tagBuf = (char*)uprv_malloc(newTagLength + 1);
// Change t->buf after the free and before return to avoid the second double free in
// the destructor of t when t is out of scope.
t->buf = tagBuf = (char*)uprv_malloc(newTagLength + 1);
if (tagBuf == nullptr) {
*status = U_MEMORY_ALLOCATION_ERROR;
return nullptr;
}
t->buf = tagBuf;
tagLen = newTagLength;
}
parsedLenDelta = checkLegacyLen - replacementLen;
@ -2646,53 +2625,18 @@ ulocimp_toLanguageTag(const char* localeID,
UBool strict,
UErrorCode* status) {
icu::CharString canonical;
int32_t reslen;
UErrorCode tmpStatus = U_ZERO_ERROR;
UBool hadPosix = false;
const char* pKeywordStart;
/* Note: uloc_canonicalize returns "en_US_POSIX" for input locale ID "". See #6835 */
int32_t resultCapacity = static_cast<int32_t>(uprv_strlen(localeID));
if (resultCapacity > 0) {
char* buffer;
for (;;) {
buffer = canonical.getAppendBuffer(
/*minCapacity=*/resultCapacity,
/*desiredCapacityHint=*/resultCapacity,
resultCapacity,
tmpStatus);
if (U_FAILURE(tmpStatus)) {
*status = tmpStatus;
return;
}
reslen =
uloc_canonicalize(localeID, buffer, resultCapacity, &tmpStatus);
if (tmpStatus != U_BUFFER_OVERFLOW_ERROR) {
break;
}
resultCapacity = reslen;
tmpStatus = U_ZERO_ERROR;
}
if (U_FAILURE(tmpStatus)) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
canonical.append(buffer, reslen, tmpStatus);
if (tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
tmpStatus = U_ZERO_ERROR; // Terminators provided by CharString.
}
if (U_FAILURE(tmpStatus)) {
*status = tmpStatus;
return;
}
{
icu::CharStringByteSink canonicalSink(&canonical);
ulocimp_canonicalize(localeID, canonicalSink, &tmpStatus);
}
if (U_FAILURE(tmpStatus)) {
*status = tmpStatus;
return;
}
/* For handling special case - private use only tag */

99
thirdparty/icu4c/common/ulocale.cpp vendored Normal file
View file

@ -0,0 +1,99 @@
// © 2023 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
//
#include "unicode/errorcode.h"
#include "unicode/stringpiece.h"
#include "unicode/utypes.h"
#include "unicode/ustring.h"
#include "unicode/ulocale.h"
#include "unicode/locid.h"
#include "charstr.h"
#include "cmemory.h"
#include "ustr_imp.h"
U_NAMESPACE_USE
#define EXTERNAL(i) (reinterpret_cast<ULocale*>(i))
#define CONST_INTERNAL(e) (reinterpret_cast<const icu::Locale*>(e))
#define INTERNAL(e) (reinterpret_cast<icu::Locale*>(e))
ULocale*
ulocale_openForLocaleID(const char* localeID, int32_t length, UErrorCode* err) {
CharString str(length < 0 ? StringPiece(localeID) : StringPiece(localeID, length), *err);
if (U_FAILURE(*err)) return nullptr;
return EXTERNAL(icu::Locale::createFromName(str.data()).clone());
}
ULocale*
ulocale_openForLanguageTag(const char* tag, int32_t length, UErrorCode* err) {
Locale l = icu::Locale::forLanguageTag(length < 0 ? StringPiece(tag) : StringPiece(tag, length), *err);
if (U_FAILURE(*err)) return nullptr;
return EXTERNAL(l.clone());
}
void
ulocale_close(ULocale* locale) {
delete INTERNAL(locale);
}
#define IMPL_ULOCALE_STRING_GETTER(N1, N2) \
const char* ulocale_get ## N1(const ULocale* locale) { \
if (locale == nullptr) return nullptr; \
return CONST_INTERNAL(locale)->get ## N2(); \
}
#define IMPL_ULOCALE_STRING_IDENTICAL_GETTER(N) IMPL_ULOCALE_STRING_GETTER(N, N)
#define IMPL_ULOCALE_GET_KEYWORD_VALUE(N) \
int32_t ulocale_get ##N ( \
const ULocale* locale, const char* keyword, int32_t keywordLength, \
char* valueBuffer, int32_t bufferCapacity, UErrorCode *err) { \
if (U_FAILURE(*err)) return 0; \
if (locale == nullptr) { \
*err = U_ILLEGAL_ARGUMENT_ERROR; \
return 0; \
} \
CheckedArrayByteSink sink(valueBuffer, bufferCapacity); \
CONST_INTERNAL(locale)->get ## N( \
keywordLength < 0 ? StringPiece(keyword) : StringPiece(keyword, keywordLength), \
sink, *err); \
int32_t reslen = sink.NumberOfBytesAppended(); \
if (U_FAILURE(*err)) { \
return reslen; \
} \
if (sink.Overflowed()) { \
*err = U_BUFFER_OVERFLOW_ERROR; \
} else { \
u_terminateChars(valueBuffer, bufferCapacity, reslen, err); \
} \
return reslen; \
}
#define IMPL_ULOCALE_GET_KEYWORDS(N) \
UEnumeration* ulocale_get ## N(const ULocale* locale, UErrorCode *err) { \
if (U_FAILURE(*err)) return nullptr; \
if (locale == nullptr) { \
*err = U_ILLEGAL_ARGUMENT_ERROR; \
return nullptr; \
} \
return uenum_openFromStringEnumeration( \
CONST_INTERNAL(locale)->create ## N(*err), err); \
}
IMPL_ULOCALE_STRING_IDENTICAL_GETTER(Language)
IMPL_ULOCALE_STRING_IDENTICAL_GETTER(Script)
IMPL_ULOCALE_STRING_GETTER(Region, Country)
IMPL_ULOCALE_STRING_IDENTICAL_GETTER(Variant)
IMPL_ULOCALE_STRING_GETTER(LocaleID, Name)
IMPL_ULOCALE_STRING_IDENTICAL_GETTER(BaseName)
IMPL_ULOCALE_GET_KEYWORD_VALUE(KeywordValue)
IMPL_ULOCALE_GET_KEYWORD_VALUE(UnicodeKeywordValue)
IMPL_ULOCALE_GET_KEYWORDS(Keywords)
IMPL_ULOCALE_GET_KEYWORDS(UnicodeKeywords)
bool ulocale_isBogus(const ULocale* locale) {
if (locale == nullptr) return false;
return CONST_INTERNAL(locale)->isBogus();
}
/*eof*/

156
thirdparty/icu4c/common/ulocbuilder.cpp vendored Normal file
View file

@ -0,0 +1,156 @@
// © 2023 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
#include <utility>
#include "unicode/bytestream.h"
#include "unicode/localebuilder.h"
#include "unicode/locid.h"
#include "unicode/stringpiece.h"
#include "unicode/umachine.h"
#include "unicode/ulocbuilder.h"
#include "cstring.h"
#include "ustr_imp.h"
using icu::CheckedArrayByteSink;
using icu::StringPiece;
#define EXTERNAL(i) (reinterpret_cast<ULocaleBuilder*>(i))
#define INTERNAL(e) (reinterpret_cast<icu::LocaleBuilder*>(e))
#define CONST_INTERNAL(e) (reinterpret_cast<const icu::LocaleBuilder*>(e))
ULocaleBuilder* ulocbld_open() {
return EXTERNAL(new icu::LocaleBuilder());
}
void ulocbld_close(ULocaleBuilder* builder) {
if (builder == nullptr) return;
delete INTERNAL(builder);
}
void ulocbld_setLocale(ULocaleBuilder* builder, const char* locale, int32_t length) {
if (builder == nullptr) return;
icu::Locale l;
if (length < 0 || locale[length] == '\0') {
l = icu::Locale(locale);
} else {
if (length >= ULOC_FULLNAME_CAPACITY) {
l.setToBogus();
} else {
// locale is not null termined but Locale API require one.
// Create a null termined version in buf.
char buf[ULOC_FULLNAME_CAPACITY];
uprv_memcpy(buf, locale, length);
buf[length] = '\0';
l = icu::Locale(buf);
}
}
INTERNAL(builder)->setLocale(l);
}
void
ulocbld_adoptULocale(ULocaleBuilder* builder, ULocale* locale) {
if (builder == nullptr) return;
INTERNAL(builder)->setLocale(*(reinterpret_cast<const icu::Locale*>(locale)));
ulocale_close(locale);
}
#define STRING_PIECE(s, l) ((l)<0 ? StringPiece(s) : StringPiece((s), (l)))
#define IMPL_ULOCBLD_SETTER(N) \
void ulocbld_##N(ULocaleBuilder* bld, const char* s, int32_t l) { \
if (bld == nullptr) return; \
INTERNAL(bld)->N(STRING_PIECE(s,l)); \
}
IMPL_ULOCBLD_SETTER(setLanguageTag)
IMPL_ULOCBLD_SETTER(setLanguage)
IMPL_ULOCBLD_SETTER(setScript)
IMPL_ULOCBLD_SETTER(setRegion)
IMPL_ULOCBLD_SETTER(setVariant)
IMPL_ULOCBLD_SETTER(addUnicodeLocaleAttribute)
IMPL_ULOCBLD_SETTER(removeUnicodeLocaleAttribute)
void ulocbld_setExtension(ULocaleBuilder* builder, char key, const char* value, int32_t length) {
if (builder == nullptr) return;
INTERNAL(builder)->setExtension(key, STRING_PIECE(value, length));
}
void ulocbld_setUnicodeLocaleKeyword(
ULocaleBuilder* builder, const char* key, int32_t keyLength,
const char* type, int32_t typeLength) {
if (builder == nullptr) return;
INTERNAL(builder)->setUnicodeLocaleKeyword(
STRING_PIECE(key, keyLength), STRING_PIECE(type, typeLength));
}
void ulocbld_clear(ULocaleBuilder* builder) {
if (builder == nullptr) return;
INTERNAL(builder)->clear();
}
void ulocbld_clearExtensions(ULocaleBuilder* builder) {
if (builder == nullptr) return;
INTERNAL(builder)->clearExtensions();
}
ULocale* ulocbld_buildULocale(ULocaleBuilder* builder, UErrorCode* err) {
if (builder == nullptr) {
*err = U_ILLEGAL_ARGUMENT_ERROR;
return nullptr;
}
icu::Locale l = INTERNAL(builder)->build(*err);
if (U_FAILURE(*err)) return nullptr;
icu::Locale* r = l.clone();
if (r == nullptr) {
*err = U_MEMORY_ALLOCATION_ERROR;
return nullptr;
}
return reinterpret_cast<ULocale*>(r);
}
int32_t ulocbld_buildLocaleID(ULocaleBuilder* builder,
char* buffer, int32_t bufferCapacity, UErrorCode* err) {
if (builder == nullptr) {
*err = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
icu::Locale l = INTERNAL(builder)->build(*err);
if (U_FAILURE(*err)) return 0;
int32_t length = (int32_t)(uprv_strlen(l.getName()));
if (0 < length && length <= bufferCapacity) {
uprv_memcpy(buffer, l.getName(), length);
}
return u_terminateChars(buffer, bufferCapacity, length, err);
}
int32_t ulocbld_buildLanguageTag(ULocaleBuilder* builder,
char* buffer, int32_t bufferCapacity, UErrorCode* err) {
if (builder == nullptr) {
*err = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
icu::Locale l = INTERNAL(builder)->build(*err);
if (U_FAILURE(*err)) return 0;
CheckedArrayByteSink sink(buffer, bufferCapacity);
l.toLanguageTag(sink, *err);
int32_t reslen = sink.NumberOfBytesAppended();
if (U_FAILURE(*err)) {
return reslen;
}
if (sink.Overflowed()) {
*err = U_BUFFER_OVERFLOW_ERROR;
} else {
u_terminateChars(buffer, bufferCapacity, reslen, err);
}
return reslen;
}
UBool ulocbld_copyErrorTo(const ULocaleBuilder* builder, UErrorCode *outErrorCode) {
if (builder == nullptr) {
*outErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
return true;
}
return CONST_INTERNAL(builder)->copyErrorTo(*outErrorCode);
}

View file

@ -92,6 +92,11 @@ ulocimp_getKeywordValue(const char* localeID,
icu::ByteSink& sink,
UErrorCode* status);
U_CAPI void U_EXPORT2
ulocimp_getParent(const char* localeID,
icu::ByteSink& sink,
UErrorCode* err);
/**
* Writes a well-formed language tag for this locale ID.
*
@ -237,6 +242,7 @@ ulocimp_addLikelySubtags(const char* localeID,
*
* @param localeID The locale to minimize
* @param sink The output sink receiving the maximized locale
* @param favorScript favor to keep script if true, region if false.
* @param err Error information if minimizing the locale failed. If the length
* of the localeID and the null-terminator is greater than the maximum allowed size,
* or the localeId is not well-formed, the error code is U_ILLEGAL_ARGUMENT_ERROR.
@ -245,6 +251,7 @@ ulocimp_addLikelySubtags(const char* localeID,
U_CAPI void U_EXPORT2
ulocimp_minimizeSubtags(const char* localeID,
icu::ByteSink& sink,
bool favorScript,
UErrorCode* err);
U_CAPI const char * U_EXPORT2
@ -307,72 +314,4 @@ U_CAPI const char* const* ulocimp_getKnownCanonicalizedLocaleForTest(int32_t* le
// Return true if the value is already canonicalized.
U_CAPI bool ulocimp_isCanonicalizedLocaleForTest(const char* localeName);
/**
* A utility class for handling locale IDs that may be longer than ULOC_FULLNAME_CAPACITY.
* This encompasses all of the logic to allocate a temporary locale ID buffer on the stack,
* and then, if it's not big enough, reallocate it on the heap and try again.
*
* You use it like this:
* UErrorCode err = U_ZERO_ERROR;
*
* PreflightingLocaleIDBuffer tempBuffer;
* do {
* tempBuffer.requestedCapacity = uloc_doSomething(localeID, tempBuffer.getBuffer(), tempBuffer.getCapacity(), &err);
* } while (tempBuffer.needToTryAgain(&err));
* if (U_SUCCESS(err)) {
* uloc_doSomethingWithTheResult(tempBuffer.getBuffer());
* }
*/
class PreflightingLocaleIDBuffer {
private:
char stackBuffer[ULOC_FULLNAME_CAPACITY];
char* heapBuffer = nullptr;
int32_t capacity = ULOC_FULLNAME_CAPACITY;
public:
int32_t requestedCapacity = ULOC_FULLNAME_CAPACITY;
// No heap allocation. Use only on the stack.
static void* U_EXPORT2 operator new(size_t) noexcept = delete;
static void* U_EXPORT2 operator new[](size_t) noexcept = delete;
#if U_HAVE_PLACEMENT_NEW
static void* U_EXPORT2 operator new(size_t, void*) noexcept = delete;
#endif
PreflightingLocaleIDBuffer() {}
~PreflightingLocaleIDBuffer() { uprv_free(heapBuffer); }
char* getBuffer() {
if (heapBuffer == nullptr) {
return stackBuffer;
} else {
return heapBuffer;
}
}
int32_t getCapacity() {
return capacity;
}
bool needToTryAgain(UErrorCode* err) {
if (heapBuffer != nullptr) {
return false;
}
if (*err == U_BUFFER_OVERFLOW_ERROR || *err == U_STRING_NOT_TERMINATED_WARNING) {
int32_t newCapacity = requestedCapacity + 2; // one for the terminating null, one just for paranoia
heapBuffer = static_cast<char*>(uprv_malloc(newCapacity));
if (heapBuffer == nullptr) {
*err = U_MEMORY_ALLOCATION_ERROR;
} else {
*err = U_ZERO_ERROR;
capacity = newCapacity;
}
return U_SUCCESS(*err);
}
return false;
}
};
#endif

View file

@ -649,6 +649,7 @@ private:
/** @internal (private) */
char actualLocale[ULOC_FULLNAME_CAPACITY];
char validLocale[ULOC_FULLNAME_CAPACITY];
char requestLocale[ULOC_FULLNAME_CAPACITY];
};
#ifndef U_HIDE_DEPRECATED_API

View file

@ -114,7 +114,7 @@
* </tr>
* <tr>
* <td>Locales </td>
* <td>uloc.h</a></td>
* <td>uloc.h, ulocale.h, ulocbuilder.h</a></td>
* <td>icu::Locale, icu::LocaleBuilder, icu::LocaleMatcher</td>
* </tr>
* <tr>

View file

@ -984,7 +984,10 @@ public:
static const char* const* U_EXPORT2 getISOCountries();
/**
* Gets a list of all available language codes defined in ISO 639. This is a pointer
* Returns a list of all unique language codes defined in ISO 639.
* They can be 2 or 3 letter codes, as defined by
* <a href="https://www.ietf.org/rfc/bcp/bcp47.html#section-2.2.1">
* BCP 47, section 2.2.1</a>. This is a pointer
* to an array of pointers to arrays of char. All of these pointers are owned
* by ICU-- do not delete them, and do not write through them. The array is
* terminated with a null pointer.
@ -1110,6 +1113,15 @@ protected: /* only protected for testing purposes. DO NOT USE. */
* @internal
*/
void setFromPOSIXID(const char *posixID);
/**
* Minimize the subtags for this Locale, per the algorithm described
* @param favorScript favor to keep script if true, to keep region if false.
* @param status error information if maximizing this Locale failed.
* If this Locale is not well-formed, the error code is
* U_ILLEGAL_ARGUMENT_ERROR.
* @internal
*/
void minimizeSubtags(bool favorScript, UErrorCode& status);
#endif /* U_HIDE_INTERNAL_API */
private:

View file

@ -147,7 +147,10 @@ public:
getNFKDInstance(UErrorCode &errorCode);
/**
* Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization.
* Returns a Normalizer2 instance for Unicode toNFKC_Casefold() normalization
* which is equivalent to applying the NFKC_Casefold mappings and then NFC.
* See https://www.unicode.org/reports/tr44/#NFKC_Casefold
*
* Same as getInstance(nullptr, "nfkc_cf", UNORM2_COMPOSE, errorCode).
* Returns an unmodifiable singleton instance. Do not delete it.
* @param errorCode Standard ICU error code. Its input value must
@ -160,6 +163,25 @@ public:
static const Normalizer2 *
getNFKCCasefoldInstance(UErrorCode &errorCode);
#ifndef U_HIDE_DRAFT_API
/**
* Returns a Normalizer2 instance for a variant of Unicode toNFKC_Casefold() normalization
* which is equivalent to applying the NFKC_Simple_Casefold mappings and then NFC.
* See https://www.unicode.org/reports/tr44/#NFKC_Simple_Casefold
*
* Same as getInstance(nullptr, "nfkc_scf", UNORM2_COMPOSE, errorCode).
* Returns an unmodifiable singleton instance. Do not delete it.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return the requested Normalizer2, if successful
* @draft ICU 74
*/
static const Normalizer2 *
getNFKCSimpleCasefoldInstance(UErrorCode &errorCode);
#endif // U_HIDE_DRAFT_API
/**
* Returns a Normalizer2 instance which uses the specified data file
* (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
@ -172,7 +194,7 @@ public:
* Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
*
* @param packageName nullptr for ICU built-in data, otherwise application data package name
* @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
* @param name "nfc" or "nfkc" or "nfkc_cf" or "nfkc_scf" or name of custom data file
* @param mode normalization mode (compose or decompose etc.)
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns

View file

@ -43,6 +43,69 @@ class RBBIDataWrapper;
class UnhandledEngine;
class UStack;
#ifndef U_HIDE_DRAFT_API
/**
* The ExternalBreakEngine class define an abstract interface for the host environment
* to provide a low level facility to break text for unicode text in script that the text boundary
* cannot be handled by upper level rule based logic, for example, for Chinese and Japanese
* word breaking, Thai, Khmer, Burmese, Lao and other Southeast Asian scripts.
* The host environment implement one or more subclass of ExternalBreakEngine and
* register them in the initialization time by calling
* RuleBasedBreakIterator::registerExternalBreakEngine(). ICU adopt and own the engine and will
* delete the registered external engine in proper time during the clean up
* event.
* @internal ICU 74 technology preview
*/
class ExternalBreakEngine : public UObject {
public:
/**
* destructor
* @internal ICU 74 technology preview
*/
virtual ~ExternalBreakEngine() {}
/**
* <p>Indicate whether this engine handles a particular character when
* the RuleBasedBreakIterator is used for a particular locale. This method is used
* by the RuleBasedBreakIterator to find a break engine.</p>
* @param c A character which begins a run that the engine might handle.
* @param locale The locale.
* @return true if this engine handles the particular character for that locale.
* @internal ICU 74 technology preview
*/
virtual bool isFor(UChar32 c, const char* locale) const = 0;
/**
* <p>Indicate whether this engine handles a particular character.This method is
* used by the RuleBasedBreakIterator after it already find a break engine to see which
* characters after the first one can be handled by this break engine.</p>
* @param c A character that the engine might handle.
* @return true if this engine handles the particular character.
* @internal ICU 74 technology preview
*/
virtual bool handles(UChar32 c) const = 0;
/**
* <p>Divide up a range of text handled by this break engine.</p>
*
* @param text A UText representing the text
* @param start The start of the range of known characters
* @param end The end of the range of known characters
* @param foundBreaks Output of C array of int32_t break positions, or
* nullptr
* @param foundBreaksCapacity The capacity of foundBreaks
* @param status Information on any errors encountered.
* @return The number of breaks found
* @internal ICU 74 technology preview
*/
virtual int32_t fillBreaks(UText* text, int32_t start, int32_t end,
int32_t* foundBreaks, int32_t foundBreaksCapacity,
UErrorCode& status) const = 0;
};
#endif /* U_HIDE_DRAFT_API */
/**
*
* A subclass of BreakIterator whose behavior is specified using a list of rules.
@ -716,9 +779,10 @@ private:
* This function returns the appropriate LanguageBreakEngine for a
* given character c.
* @param c A character in the dictionary set
* @param locale The locale.
* @internal (private)
*/
const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c);
const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c, const char* locale);
public:
#ifndef U_HIDE_INTERNAL_API
@ -734,8 +798,24 @@ private:
*/
void dumpTables();
#endif /* U_HIDE_INTERNAL_API */
#ifndef U_HIDE_DRAFT_API
/**
* Register a new external break engine. The external break engine will be adopted.
* Because ICU may choose to cache break engine internally, this must
* be called at application startup, prior to any calls to
* object methods of RuleBasedBreakIterator to avoid undefined behavior.
* @param toAdopt the ExternalBreakEngine instance to be adopted
* @param status the in/out status code, no special meanings are assigned
* @internal ICU 74 technology preview
*/
static void U_EXPORT2 registerExternalBreakEngine(
ExternalBreakEngine* toAdopt, UErrorCode& status);
#endif /* U_HIDE_DRAFT_API */
};
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

View file

@ -60,7 +60,7 @@ U_CDECL_BEGIN
* @see u_getUnicodeVersion
* @stable ICU 2.0
*/
#define U_UNICODE_VERSION "15.0"
#define U_UNICODE_VERSION "15.1"
/**
* \file
@ -532,12 +532,33 @@ typedef enum UProperty {
* @stable ICU 70
*/
UCHAR_RGI_EMOJI=71,
#ifndef U_HIDE_DRAFT_API
/**
* Binary property IDS_Unary_Operator.
* For programmatic determination of Ideographic Description Sequences.
*
* @draft ICU 74
*/
UCHAR_IDS_UNARY_OPERATOR=72,
/**
* Binary property ID_Compat_Math_Start.
* Used in mathematical identifier profile in UAX #31.
* @draft ICU 74
*/
UCHAR_ID_COMPAT_MATH_START=73,
/**
* Binary property ID_Compat_Math_Continue.
* Used in mathematical identifier profile in UAX #31.
* @draft ICU 74
*/
UCHAR_ID_COMPAT_MATH_CONTINUE=74,
#endif // U_HIDE_DRAFT_API
#ifndef U_HIDE_DEPRECATED_API
/**
* One more than the last constant for binary Unicode properties.
* @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
*/
UCHAR_BINARY_LIMIT=72,
UCHAR_BINARY_LIMIT=75,
#endif // U_HIDE_DEPRECATED_API
/** Enumerated property Bidi_Class.
@ -1900,6 +1921,11 @@ enum UBlockCode {
/** @stable ICU 72 */
UBLOCK_NAG_MUNDARI = 327, /*[1E4D0]*/
// New block in Unicode 15.1
/** @stable ICU 74 */
UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I = 328, /*[2EBF0]*/
#ifndef U_HIDE_DEPRECATED_API
/**
* One more than the highest normal UBlockCode value.
@ -1907,7 +1933,7 @@ enum UBlockCode {
*
* @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
*/
UBLOCK_COUNT = 328,
UBLOCK_COUNT = 329,
#endif // U_HIDE_DEPRECATED_API
/** @stable ICU 2.0 */
@ -2439,6 +2465,16 @@ typedef enum ULineBreak {
U_LB_E_MODIFIER = 41, /*[EM]*/
/** @stable ICU 58 */
U_LB_ZWJ = 42, /*[ZWJ]*/
/** @stable ICU 74 */
U_LB_AKSARA = 43, /*[AK]*/
/** @stable ICU 74 */
U_LB_AKSARA_PREBASE = 44, /*[AP]*/
/** @stable ICU 74 */
U_LB_AKSARA_START = 45, /*[AS]*/
/** @stable ICU 74 */
U_LB_VIRAMA_FINAL = 46, /*[VF]*/
/** @stable ICU 74 */
U_LB_VIRAMA = 47, /*[VI]*/
#ifndef U_HIDE_DEPRECATED_API
/**
* One more than the highest normal ULineBreak value.
@ -2446,7 +2482,7 @@ typedef enum ULineBreak {
*
* @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
*/
U_LB_COUNT = 43
U_LB_COUNT = 48
#endif // U_HIDE_DEPRECATED_API
} ULineBreak;

View file

@ -0,0 +1,229 @@
// © 2023 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
#ifndef ULOCALE_H
#define ULOCALE_H
#include "unicode/localpointer.h"
#include "unicode/uenum.h"
#include "unicode/utypes.h"
/**
* \file
* \brief C API: Locale ID functionality similar to C++ class Locale
*/
#ifndef U_HIDE_DRAFT_API
/**
* Opaque C service object type for the locale API
* @draft ICU 74
*/
struct ULocale;
/**
* C typedef for struct ULocale.
* @draft ICU 74
*/
typedef struct ULocale ULocale;
/**
* Constructs an ULocale from the locale ID.
* The created ULocale should be destroyed by calling
* ulocale_close();
* @param localeID the locale, a const char * pointer (need not be terminated when
* the length is non-negative)
* @param length the length of the locale; if negative, then the locale need to be
* null terminated.
* @param err the error code
* @return the locale.
*
* @draft ICU 74
*/
U_CAPI ULocale* U_EXPORT2
ulocale_openForLocaleID(const char* localeID, int32_t length, UErrorCode* err);
/**
* Constructs an ULocale from the provided IETF BCP 47 language tag.
* The created ULocale should be destroyed by calling
* ulocale_close();
* @param tag the language tag, defined as IETF BCP 47 language tag, const
* char* pointer (need not be terminated when the length is non-negative)
* @param length the length of the tag; if negative, then the tag need to be
* null terminated.
* @param err the error code
* @return the locale.
*
* @draft ICU 74
*/
U_CAPI ULocale* U_EXPORT2
ulocale_openForLanguageTag(const char* tag, int32_t length, UErrorCode* err);
/**
* Close the locale and destroy it's internal states.
*
* @param locale the locale
* @draft ICU 74
*/
U_CAPI void U_EXPORT2
ulocale_close(ULocale* locale);
/**
* Returns the locale's ISO-639 language code.
*
* @param locale the locale
* @return the language code of the locale.
* @draft ICU 74
*/
U_CAPI const char* U_EXPORT2
ulocale_getLanguage(const ULocale* locale);
/**
* Returns the locale's ISO-15924 abbreviation script code.
*
* @param locale the locale
* @return A pointer to the script.
* @draft ICU 74
*/
U_CAPI const char* U_EXPORT2
ulocale_getScript(const ULocale* locale);
/**
* Returns the locale's ISO-3166 region code.
*
* @param locale the locale
* @return A pointer to the region.
* @draft ICU 74
*/
U_CAPI const char* U_EXPORT2
ulocale_getRegion(const ULocale* locale);
/**
* Returns the locale's variant code.
*
* @param locale the locale
* @return A pointer to the variant.
* @draft ICU 74
*/
U_CAPI const char* U_EXPORT2
ulocale_getVariant(const ULocale* locale);
/**
* Returns the programmatic name of the entire locale, with the language,
* country and variant separated by underbars. If a field is missing, up
* to two leading underbars will occur. Example: "en", "de_DE", "en_US_WIN",
* "de__POSIX", "fr__MAC", "__MAC", "_MT", "_FR_EURO"
*
* @param locale the locale
* @return A pointer to "name".
* @draft ICU 74
*/
U_CAPI const char* U_EXPORT2
ulocale_getLocaleID(const ULocale* locale);
/**
* Returns the programmatic name of the entire locale as ulocale_getLocaleID()
* would return, but without keywords.
*
* @param locale the locale
* @return A pointer to "base name".
* @draft ICU 74
*/
U_CAPI const char* U_EXPORT2
ulocale_getBaseName(const ULocale* locale);
/**
* Gets the bogus state. Locale object can be bogus if it doesn't exist
*
* @param locale the locale
* @return false if it is a real locale, true if it is a bogus locale
* @draft ICU 74
*/
U_CAPI bool U_EXPORT2
ulocale_isBogus(const ULocale* locale);
/**
* Gets the list of keywords for the specified locale.
*
* @param locale the locale
* @param err the error code
* @return pointer to UEnumeration, or nullptr if there are no keywords.
* Client must call uenum_close() to dispose the returned value.
* @draft ICU 74
*/
U_CAPI UEnumeration* U_EXPORT2
ulocale_getKeywords(const ULocale* locale, UErrorCode *err);
/**
* Gets the list of unicode keywords for the specified locale.
*
* @param locale the locale
* @param err the error code
* @return pointer to UEnumeration, or nullptr if there are no keywords.
* Client must call uenum_close() to dispose the returned value.
* @draft ICU 74
*/
U_CAPI UEnumeration* U_EXPORT2
ulocale_getUnicodeKeywords(const ULocale* locale, UErrorCode *err);
/**
* Gets the value for a keyword.
*
* This uses legacy keyword=value pairs, like "collation=phonebook".
*
* @param locale the locale
* @param keyword the keyword, a const char * pointer (need not be
* terminated when the length is non-negative)
* @param keywordLength the length of the keyword; if negative, then the
* keyword need to be null terminated.
* @param valueBuffer The buffer to receive the value.
* @param valueBufferCapacity The capacity of receiving valueBuffer.
* @param err the error code
* @draft ICU 74
*/
U_CAPI int32_t U_EXPORT2
ulocale_getKeywordValue(
const ULocale* locale, const char* keyword, int32_t keywordLength,
char* valueBuffer, int32_t valueBufferCapacity, UErrorCode *err);
/**
* Gets the Unicode value for a Unicode keyword.
*
* This uses Unicode key-value pairs, like "co-phonebk".
*
* @param locale the locale
* @param keyword the Unicode keyword, a const char * pointer (need not be
* terminated when the length is non-negative)
* @param keywordLength the length of the Unicode keyword; if negative,
* then the keyword need to be null terminated.
* @param valueBuffer The buffer to receive the Unicode value.
* @param valueBufferCapacity The capacity of receiving valueBuffer.
* @param err the error code
* @draft ICU 74
*/
U_CAPI int32_t U_EXPORT2
ulocale_getUnicodeKeywordValue(
const ULocale* locale, const char* keyword, int32_t keywordLength,
char* valueBuffer, int32_t valueBufferCapacity, UErrorCode *err);
#if U_SHOW_CPLUSPLUS_API
U_NAMESPACE_BEGIN
/**
* \class LocalULocalePointer
* "Smart pointer" class, closes a ULocale via ulocale_close().
* For most methods see the LocalPointerBase base class.
*
* @see LocalPointerBase
* @see LocalPointer
* @draft ICU 74
*/
U_DEFINE_LOCAL_OPEN_POINTER(LocalULocalePointer, ULocale, ulocale_close);
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
#endif /* U_HIDE_DRAFT_API */
#endif /*_ULOCALE */

View file

@ -0,0 +1,441 @@
// © 2023 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
#ifndef __ULOCBUILDER_H__
#define __ULOCBUILDER_H__
#include "unicode/localpointer.h"
#include "unicode/ulocale.h"
#include "unicode/utypes.h"
/**
* \file
* \brief C API: Builder API for Locale
*/
#ifndef U_HIDE_DRAFT_API
/**
* Opaque C service object type for the locale builder API
* @draft ICU 74
*/
struct ULocaleBuilder;
/**
* C typedef for struct ULocaleBuilder.
* @draft ICU 74
*/
typedef struct ULocaleBuilder ULocaleBuilder;
/**
* <code>ULocaleBuilder</code> is used to build valid <code>locale</code> id
* string or IETF BCP 47 language tag from values configured by the setters.
* The <code>ULocaleBuilder</code> checks if a value configured by a
* setter satisfies the syntax requirements defined by the <code>Locale</code>
* class. A string of Locale created by a <code>ULocaleBuilder</code> is
* well-formed and can be transformed to a well-formed IETF BCP 47 language tag
* without losing information.
*
* <p>The following example shows how to create a <code>locale</code> string
* with the <code>ULocaleBuilder</code>.
* <blockquote>
* <pre>
* UErrorCode err = U_ZERO_ERROR;
* char buffer[ULOC_FULLNAME_CAPACITY];
* ULocaleBuilder* builder = ulocbld_open();
* ulocbld_setLanguage(builder, "sr", -1);
* ulocbld_setScript(builder, "Latn", -1);
* ulocbld_setRegion(builder, "RS", -1);
* int32_t length = ulocbld_buildLocaleID(
* builder, buffer, ULOC_FULLNAME_CAPACITY, &error);
* ulocbld_close(builder);
* </pre>
* </blockquote>
*
* <p>ULocaleBuilders can be reused; <code>ulocbld_clear()</code> resets all
* fields to their default values.
*
* <p>ULocaleBuilder tracks errors in an internal UErrorCode. For all setters,
* except ulocbld_setLanguageTag and ulocbld_setLocale, ULocaleBuilder will return immediately
* if the internal UErrorCode is in error state.
* To reset internal state and error code, call clear method.
* The ulocbld_setLanguageTag and setLocale method will first clear the internal
* UErrorCode, then track the error of the validation of the input parameter
* into the internal UErrorCode.
*
* @draft ICU 74
*/
/**
* Constructs an empty ULocaleBuilder. The default value of all
* fields, extensions, and private use information is the
* empty string. The created builder should be destroyed by calling
* ulocbld_close();
*
* @draft ICU 74
*/
U_CAPI ULocaleBuilder* U_EXPORT2
ulocbld_open();
/**
* Close the builder and destroy it's internal states.
* @param builder the builder
* @draft ICU 74
*/
U_CAPI void U_EXPORT2
ulocbld_close(ULocaleBuilder* builder);
/**
* Resets the <code>ULocaleBuilder</code> to match the provided
* <code>locale</code>. Existing state is discarded.
*
* <p>All fields of the locale must be well-formed.
* <p>This method clears the internal UErrorCode.
*
* @param builder the builder
* @param locale the locale, a const char * pointer (need not be terminated when
* the length is non-negative)
* @param length the length of the locale; if negative, then the locale need to be
* null terminated,
*
* @draft ICU 74
*/
U_CAPI void U_EXPORT2
ulocbld_setLocale(ULocaleBuilder* builder, const char* locale, int32_t length);
/**
* Resets the <code>ULocaleBuilder</code> to match the provided
* <code>ULocale</code>. Existing state is discarded.
*
* <p>The locale must be not bogus.
* <p>This method clears the internal UErrorCode.
*
* @param builder the builder.
* @param locale the locale, a ULocale* pointer. The builder adopts the locale
* after the call and the client must not delete it.
*
* @draft ICU 74
*/
U_CAPI void U_EXPORT2
ulocbld_adoptULocale(ULocaleBuilder* builder, ULocale* locale);
/**
* Resets the ULocaleBuilder to match the provided IETF BCP 47 language tag.
* Discards the existing state.
* The empty string causes the builder to be reset, like {@link #ulocbld_clear}.
* Legacy language tags (marked as Type: grandfathered in BCP 47)
* are converted to their canonical form before being processed.
* Otherwise, the <code>language tag</code> must be well-formed,
* or else the ulocbld_buildLocaleID() and ulocbld_buildLanguageTag() methods
* will later report an U_ILLEGAL_ARGUMENT_ERROR.
*
* <p>This method clears the internal UErrorCode.
*
* @param builder the builder
* @param tag the language tag, defined as IETF BCP 47 language tag, a
* const char * pointer (need not be terminated when
* the length is non-negative)
* @param length the length of the tag; if negative, then the tag need to be
* null terminated,
* @draft ICU 74
*/
U_CAPI void U_EXPORT2
ulocbld_setLanguageTag(ULocaleBuilder* builder, const char* tag, int32_t length);
/**
* Sets the language. If <code>language</code> is the empty string, the
* language in this <code>ULocaleBuilder</code> is removed. Otherwise, the
* <code>language</code> must be well-formed, or else the ulocbld_buildLocaleID()
* and ulocbld_buildLanguageTag() methods will
* later report an U_ILLEGAL_ARGUMENT_ERROR.
*
* <p>The syntax of language value is defined as
* [unicode_language_subtag](http://www.unicode.org/reports/tr35/tr35.html#unicode_language_subtag).
*
* @param builder the builder
* @param language the language, a const char * pointer (need not be terminated when
* the length is non-negative)
* @param length the length of the language; if negative, then the language need to be
* null terminated,
* @draft ICU 74
*/
U_CAPI void U_EXPORT2
ulocbld_setLanguage(ULocaleBuilder* builder, const char* language, int32_t length);
/**
* Sets the script. If <code>script</code> is the empty string, the script in
* this <code>ULocaleBuilder</code> is removed.
* Otherwise, the <code>script</code> must be well-formed, or else the
* ulocbld_buildLocaleID() and ulocbld_buildLanguageTag() methods will later
* report an U_ILLEGAL_ARGUMENT_ERROR.
*
* <p>The script value is a four-letter script code as
* [unicode_script_subtag](http://www.unicode.org/reports/tr35/tr35.html#unicode_script_subtag)
* defined by ISO 15924
*
* @param builder the builder
* @param script the script, a const char * pointer (need not be terminated when
* the length is non-negative)
* @param length the length of the script; if negative, then the script need to be
* null terminated,
* @draft ICU 74
*/
U_CAPI void U_EXPORT2
ulocbld_setScript(ULocaleBuilder* builder, const char* script, int32_t length);
/**
* Sets the region. If region is the empty string, the region in this
* <code>ULocaleBuilder</code> is removed. Otherwise, the <code>region</code>
* must be well-formed, or else the ulocbld_buildLocaleID() and
* ulocbld_buildLanguageTag() methods will later report an
* U_ILLEGAL_ARGUMENT_ERROR.
*
* <p>The region value is defined by
* [unicode_region_subtag](http://www.unicode.org/reports/tr35/tr35.html#unicode_region_subtag)
* as a two-letter ISO 3166 code or a three-digit UN M.49 area code.
*
* <p>The region value in the <code>Locale</code> created by the
* <code>ULocaleBuilder</code> is always normalized to upper case.
*
* @param builder the builder
* @param region the region, a const char * pointer (need not be terminated when
* the length is non-negative)
* @param length the length of the region; if negative, then the region need to be
* null terminated,
* @draft ICU 74
*/
U_CAPI void U_EXPORT2
ulocbld_setRegion(ULocaleBuilder* builder, const char* region, int32_t length);
/**
* Sets the variant. If variant is the empty string, the variant in this
* <code>ULocaleBuilder</code> is removed. Otherwise, the <code>variant</code>
* must be well-formed, or else the ulocbld_buildLocaleID() and
* ulocbld_buildLanguageTag() methods will later report an
* U_ILLEGAL_ARGUMENT_ERROR.
*
* <p><b>Note:</b> This method checks if <code>variant</code>
* satisfies the
* [unicode_variant_subtag](http://www.unicode.org/reports/tr35/tr35.html#unicode_variant_subtag)
* syntax requirements, and normalizes the value to lowercase letters. However,
* the <code>Locale</code> class does not impose any syntactic
* restriction on variant. To set an ill-formed variant, use a Locale constructor.
* If there are multiple unicode_variant_subtag, the caller must concatenate
* them with '-' as separator (ex: "foobar-fibar").
*
* @param builder the builder
* @param variant the variant, a const char * pointer (need not be terminated when
* the length is non-negative)
* @param length the length of the variant; if negative, then the variant need to be
* null terminated,
* @draft ICU 74
*/
U_CAPI void U_EXPORT2
ulocbld_setVariant(ULocaleBuilder* builder, const char* variant, int32_t length);
/**
* Sets the extension for the given key. If the value is the empty string,
* the extension is removed. Otherwise, the <code>key</code> and
* <code>value</code> must be well-formed, or else the ulocbld_buildLocaleID()
* and ulocbld_buildLanguageTag() methods will
* later report an U_ILLEGAL_ARGUMENT_ERROR.
*
* <p><b>Note:</b> The key ('u') is used for the Unicode locale extension.
* Setting a value for this key replaces any existing Unicode locale key/type
* pairs with those defined in the extension.
*
* <p><b>Note:</b> The key ('x') is used for the private use code. To be
* well-formed, the value for this key needs only to have subtags of one to
* eight alphanumeric characters, not two to eight as in the general case.
*
* @param builder the builder
* @param key the extension key
* @param value the value, a const char * pointer (need not be terminated when
* the length is non-negative)
* @param length the length of the value; if negative, then the value need to be
* null terminated,
* @draft ICU 74
*/
U_CAPI void U_EXPORT2
ulocbld_setExtension(ULocaleBuilder* builder, char key, const char* value, int32_t length);
/**
* Sets the Unicode locale keyword type for the given key. If the type
* StringPiece is constructed with a nullptr, the keyword is removed.
* If the type is the empty string, the keyword is set without type subtags.
* Otherwise, the key and type must be well-formed, or else the
* ulocbld_buildLocaleID() and ulocbld_buildLanguageTag() methods will later
* report an U_ILLEGAL_ARGUMENT_ERROR.
*
* <p>Keys and types are converted to lower case.
*
* <p><b>Note</b>:Setting the 'u' extension via {@link #ulocbld_setExtension}
* replaces all Unicode locale keywords with those defined in the
* extension.
*
* @param builder the builder
* @param key the Unicode locale key, a const char * pointer (need not be
* terminated when the length is non-negative)
* @param keyLength the length of the key; if negative, then the key need to be
* null terminated,
* @param type the Unicode locale type, a const char * pointer (need not be
* terminated when the length is non-negative)
* @param typeLength the length of the type; if negative, then the type need to
* be null terminated,
* @return This builder.
* @draft ICU 74
*/
U_CAPI void U_EXPORT2
ulocbld_setUnicodeLocaleKeyword(ULocaleBuilder* builder,
const char* key, int32_t keyLength, const char* type, int32_t typeLength);
/**
* Adds a unicode locale attribute, if not already present, otherwise
* has no effect. The attribute must not be empty string and must be
* well-formed or U_ILLEGAL_ARGUMENT_ERROR will be set to status
* during the ulocbld_buildLocaleID() and ulocbld_buildLanguageTag() calls.
*
* @param builder the builder
* @param attribute the attribute, a const char * pointer (need not be
* terminated when the length is non-negative)
* @param length the length of the attribute; if negative, then the attribute
* need to be null terminated,
* @draft ICU 74
*/
U_CAPI void U_EXPORT2
ulocbld_addUnicodeLocaleAttribute(
ULocaleBuilder* builder, const char* attribute, int32_t length);
/**
* Removes a unicode locale attribute, if present, otherwise has no
* effect. The attribute must not be empty string and must be well-formed
* or U_ILLEGAL_ARGUMENT_ERROR will be set to status during the ulocbld_buildLocaleID()
* and ulocbld_buildLanguageTag() calls.
*
* <p>Attribute comparison for removal is case-insensitive.
*
* @param builder the builder
* @param attribute the attribute, a const char * pointer (need not be
* terminated when the length is non-negative)
* @param length the length of the attribute; if negative, then the attribute
* need to be null terminated,
* @draft ICU 74
*/
U_CAPI void U_EXPORT2
ulocbld_removeUnicodeLocaleAttribute(
ULocaleBuilder* builder, const char* attribute, int32_t length);
/**
* Resets the builder to its initial, empty state.
* <p>This method clears the internal UErrorCode.
*
* @param builder the builder
* @draft ICU 74
*/
U_CAPI void U_EXPORT2
ulocbld_clear(ULocaleBuilder* builder);
/**
* Resets the extensions to their initial, empty state.
* Language, script, region and variant are unchanged.
*
* @param builder the builder
* @draft ICU 74
*/
U_CAPI void U_EXPORT2
ulocbld_clearExtensions(ULocaleBuilder* builder);
/**
* Build the LocaleID string from the fields set on this builder.
* If any set methods or during the ulocbld_buildLocaleID() call require memory
* allocation but fail U_MEMORY_ALLOCATION_ERROR will be set to status.
* If any of the fields set by the setters are not well-formed, the status
* will be set to U_ILLEGAL_ARGUMENT_ERROR. The state of the builder will
* not change after the ulocbld_buildLocaleID() call and the caller is
* free to keep using the same builder to build more locales.
*
* @param builder the builder
* @param locale the locale id
* @param localeCapacity the size of the locale buffer to store the locale id
* @param err the error code
* @return the length of the locale id in buffer
* @draft ICU 74
*/
U_CAPI int32_t U_EXPORT2
ulocbld_buildLocaleID(ULocaleBuilder* builder, char* locale,
int32_t localeCapacity, UErrorCode* err);
/**
* Build the ULocale object from the fields set on this builder.
* If any set methods or during the ulocbld_buildULocale() call require memory
* allocation but fail U_MEMORY_ALLOCATION_ERROR will be set to status.
* If any of the fields set by the setters are not well-formed, the status
* will be set to U_ILLEGAL_ARGUMENT_ERROR. The state of the builder will
* not change after the ulocbld_buildULocale() call and the caller is
* free to keep using the same builder to build more locales.
*
* @param builder the builder.
* @param err the error code.
* @return the locale, a ULocale* pointer. The created ULocale must be
* destroyed by calling {@link ulocale_close}.
* @draft ICU 74
*/
U_CAPI ULocale* U_EXPORT2
ulocbld_buildULocale(ULocaleBuilder* builder, UErrorCode* err);
/**
* Build the IETF BCP 47 language tag string from the fields set on this builder.
* If any set methods or during the ulocbld_buildLanguageTag() call require memory
* allocation but fail U_MEMORY_ALLOCATION_ERROR will be set to status.
* If any of the fields set by the setters are not well-formed, the status
* will be set to U_ILLEGAL_ARGUMENT_ERROR. The state of the builder will
* not change after the ulocbld_buildLanguageTag() call and the caller is free
* to keep using the same builder to build more locales.
*
* @param builder the builder
* @param language the language tag
* @param languageCapacity the size of the language buffer to store the language
* tag
* @param err the error code
* @return the length of the language tag in buffer
* @draft ICU 74
*/
U_CAPI int32_t U_EXPORT2
ulocbld_buildLanguageTag(ULocaleBuilder* builder, char* language,
int32_t languageCapacity, UErrorCode* err);
/**
* Sets the UErrorCode if an error occurred while recording sets.
* Preserves older error codes in the outErrorCode.
*
* @param builder the builder
* @param outErrorCode Set to an error code that occurred while setting subtags.
* Unchanged if there is no such error or if outErrorCode
* already contained an error.
* @return true if U_FAILURE(*outErrorCode)
* @draft ICU 74
*/
U_CAPI UBool U_EXPORT2
ulocbld_copyErrorTo(const ULocaleBuilder* builder, UErrorCode *outErrorCode);
#if U_SHOW_CPLUSPLUS_API
U_NAMESPACE_BEGIN
/**
* \class LocalULocaleBuilderPointer
* "Smart pointer" class, closes a ULocaleBuilder via ulocbld_close().
* For most methods see the LocalPointerBase base class.
*
* @see LocalPointerBase
* @see LocalPointer
* @draft ICU 74
*/
U_DEFINE_LOCAL_OPEN_POINTER(LocalULocaleBuilderPointer, ULocaleBuilder, ulocbld_close);
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
#endif /* U_HIDE_DRAFT_API */
#endif // __ULOCBUILDER_H__

View file

@ -181,7 +181,10 @@ U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getNFKDInstance(UErrorCode *pErrorCode);
/**
* Returns a UNormalizer2 instance for Unicode NFKC_Casefold normalization.
* Returns a UNormalizer2 instance for Unicode toNFKC_Casefold() normalization
* which is equivalent to applying the NFKC_Casefold mappings and then NFC.
* See https://www.unicode.org/reports/tr44/#NFKC_Casefold
*
* Same as unorm2_getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, pErrorCode).
* Returns an unmodifiable singleton instance. Do not delete it.
* @param pErrorCode Standard ICU error code. Its input value must
@ -194,6 +197,25 @@ unorm2_getNFKDInstance(UErrorCode *pErrorCode);
U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getNFKCCasefoldInstance(UErrorCode *pErrorCode);
#ifndef U_HIDE_DRAFT_API
/**
* Returns a UNormalizer2 instance for a variant of Unicode toNFKC_Casefold() normalization
* which is equivalent to applying the NFKC_Simple_Casefold mappings and then NFC.
* See https://www.unicode.org/reports/tr44/#NFKC_Simple_Casefold
*
* Same as unorm2_getInstance(NULL, "nfkc_scf", UNORM2_COMPOSE, pErrorCode).
* Returns an unmodifiable singleton instance. Do not delete it.
* @param pErrorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return the requested Normalizer2, if successful
* @draft ICU 74
*/
U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getNFKCSimpleCasefoldInstance(UErrorCode *pErrorCode);
#endif // U_HIDE_DRAFT_API
/**
* Returns a UNormalizer2 instance which uses the specified data file
* (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
@ -206,7 +228,7 @@ unorm2_getNFKCCasefoldInstance(UErrorCode *pErrorCode);
* Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
*
* @param packageName NULL for ICU built-in data, otherwise application data package name
* @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
* @param name "nfc" or "nfkc" or "nfkc_cf" or "nfkc_scf" or name of custom data file
* @param mode normalization mode (compose or decompose etc.)
* @param pErrorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns

View file

@ -138,8 +138,8 @@
#define locale_getKeywordsStart U_ICU_ENTRY_POINT_RENAME(locale_getKeywordsStart)
#define locale_get_default U_ICU_ENTRY_POINT_RENAME(locale_get_default)
#define locale_set_default U_ICU_ENTRY_POINT_RENAME(locale_set_default)
#define mixedMeasuresToMicros U_ICU_ENTRY_POINT_RENAME(mixedMeasuresToMicros)
#define numSysCleanup U_ICU_ENTRY_POINT_RENAME(numSysCleanup)
#define rbbi_cleanup U_ICU_ENTRY_POINT_RENAME(rbbi_cleanup)
#define pl_addFontRun U_ICU_ENTRY_POINT_RENAME(pl_addFontRun)
#define pl_addLocaleRun U_ICU_ENTRY_POINT_RENAME(pl_addLocaleRun)
#define pl_addValueRun U_ICU_ENTRY_POINT_RENAME(pl_addValueRun)
@ -193,6 +193,7 @@
#define pl_resetFontRuns U_ICU_ENTRY_POINT_RENAME(pl_resetFontRuns)
#define pl_resetLocaleRuns U_ICU_ENTRY_POINT_RENAME(pl_resetLocaleRuns)
#define pl_resetValueRuns U_ICU_ENTRY_POINT_RENAME(pl_resetValueRuns)
#define rbbi_cleanup U_ICU_ENTRY_POINT_RENAME(rbbi_cleanup)
#define res_countArrayItems U_ICU_ENTRY_POINT_RENAME(res_countArrayItems)
#define res_findResource U_ICU_ENTRY_POINT_RENAME(res_findResource)
#define res_getAlias U_ICU_ENTRY_POINT_RENAME(res_getAlias)
@ -512,9 +513,6 @@
#define ubrk_setText U_ICU_ENTRY_POINT_RENAME(ubrk_setText)
#define ubrk_setUText U_ICU_ENTRY_POINT_RENAME(ubrk_setUText)
#define ubrk_swap U_ICU_ENTRY_POINT_RENAME(ubrk_swap)
#define ucache_compareKeys U_ICU_ENTRY_POINT_RENAME(ucache_compareKeys)
#define ucache_deleteKey U_ICU_ENTRY_POINT_RENAME(ucache_deleteKey)
#define ucache_hashKeys U_ICU_ENTRY_POINT_RENAME(ucache_hashKeys)
#define ucal_add U_ICU_ENTRY_POINT_RENAME(ucal_add)
#define ucal_clear U_ICU_ENTRY_POINT_RENAME(ucal_clear)
#define ucal_clearField U_ICU_ENTRY_POINT_RENAME(ucal_clearField)
@ -532,6 +530,7 @@
#define ucal_getFieldDifference U_ICU_ENTRY_POINT_RENAME(ucal_getFieldDifference)
#define ucal_getGregorianChange U_ICU_ENTRY_POINT_RENAME(ucal_getGregorianChange)
#define ucal_getHostTimeZone U_ICU_ENTRY_POINT_RENAME(ucal_getHostTimeZone)
#define ucal_getIanaTimeZoneID U_ICU_ENTRY_POINT_RENAME(ucal_getIanaTimeZoneID)
#define ucal_getKeywordValuesForLocale U_ICU_ENTRY_POINT_RENAME(ucal_getKeywordValuesForLocale)
#define ucal_getLimit U_ICU_ENTRY_POINT_RENAME(ucal_getLimit)
#define ucal_getLocaleByType U_ICU_ENTRY_POINT_RENAME(ucal_getLocaleByType)
@ -587,6 +586,7 @@
#define ucasemap_getLocale U_ICU_ENTRY_POINT_RENAME(ucasemap_getLocale)
#define ucasemap_getOptions U_ICU_ENTRY_POINT_RENAME(ucasemap_getOptions)
#define ucasemap_internalUTF8ToTitle U_ICU_ENTRY_POINT_RENAME(ucasemap_internalUTF8ToTitle)
#define ucasemap_mapUTF8 U_ICU_ENTRY_POINT_RENAME(ucasemap_mapUTF8)
#define ucasemap_open U_ICU_ENTRY_POINT_RENAME(ucasemap_open)
#define ucasemap_setBreakIterator U_ICU_ENTRY_POINT_RENAME(ucasemap_setBreakIterator)
#define ucasemap_setLocale U_ICU_ENTRY_POINT_RENAME(ucasemap_setLocale)
@ -955,9 +955,16 @@
#define ufieldpositer_close U_ICU_ENTRY_POINT_RENAME(ufieldpositer_close)
#define ufieldpositer_next U_ICU_ENTRY_POINT_RENAME(ufieldpositer_next)
#define ufieldpositer_open U_ICU_ENTRY_POINT_RENAME(ufieldpositer_open)
#define ufile_close_translit U_ICU_ENTRY_POINT_RENAME(ufile_close_translit)
#define ufile_fill_uchar_buffer U_ICU_ENTRY_POINT_RENAME(ufile_fill_uchar_buffer)
#define ufile_flush_io U_ICU_ENTRY_POINT_RENAME(ufile_flush_io)
#define ufile_flush_translit U_ICU_ENTRY_POINT_RENAME(ufile_flush_translit)
#define ufile_getch U_ICU_ENTRY_POINT_RENAME(ufile_getch)
#define ufile_getch32 U_ICU_ENTRY_POINT_RENAME(ufile_getch32)
#define ufmt_64tou U_ICU_ENTRY_POINT_RENAME(ufmt_64tou)
#define ufmt_close U_ICU_ENTRY_POINT_RENAME(ufmt_close)
#define ufmt_defaultCPToUnicode U_ICU_ENTRY_POINT_RENAME(ufmt_defaultCPToUnicode)
#define ufmt_digitvalue U_ICU_ENTRY_POINT_RENAME(ufmt_digitvalue)
#define ufmt_getArrayItemByIndex U_ICU_ENTRY_POINT_RENAME(ufmt_getArrayItemByIndex)
#define ufmt_getArrayLength U_ICU_ENTRY_POINT_RENAME(ufmt_getArrayLength)
#define ufmt_getDate U_ICU_ENTRY_POINT_RENAME(ufmt_getDate)
@ -969,7 +976,11 @@
#define ufmt_getType U_ICU_ENTRY_POINT_RENAME(ufmt_getType)
#define ufmt_getUChars U_ICU_ENTRY_POINT_RENAME(ufmt_getUChars)
#define ufmt_isNumeric U_ICU_ENTRY_POINT_RENAME(ufmt_isNumeric)
#define ufmt_isdigit U_ICU_ENTRY_POINT_RENAME(ufmt_isdigit)
#define ufmt_open U_ICU_ENTRY_POINT_RENAME(ufmt_open)
#define ufmt_ptou U_ICU_ENTRY_POINT_RENAME(ufmt_ptou)
#define ufmt_uto64 U_ICU_ENTRY_POINT_RENAME(ufmt_uto64)
#define ufmt_utop U_ICU_ENTRY_POINT_RENAME(ufmt_utop)
#define ufmtval_getString U_ICU_ENTRY_POINT_RENAME(ufmtval_getString)
#define ufmtval_nextPosition U_ICU_ENTRY_POINT_RENAME(ufmtval_nextPosition)
#define ugender_getInstance U_ICU_ENTRY_POINT_RENAME(ugender_getInstance)
@ -1133,6 +1144,39 @@
#define uloc_toLegacyType U_ICU_ENTRY_POINT_RENAME(uloc_toLegacyType)
#define uloc_toUnicodeLocaleKey U_ICU_ENTRY_POINT_RENAME(uloc_toUnicodeLocaleKey)
#define uloc_toUnicodeLocaleType U_ICU_ENTRY_POINT_RENAME(uloc_toUnicodeLocaleType)
#define ulocale_close U_ICU_ENTRY_POINT_RENAME(ulocale_close)
#define ulocale_getBaseName U_ICU_ENTRY_POINT_RENAME(ulocale_getBaseName)
#define ulocale_getKeywordValue U_ICU_ENTRY_POINT_RENAME(ulocale_getKeywordValue)
#define ulocale_getKeywords U_ICU_ENTRY_POINT_RENAME(ulocale_getKeywords)
#define ulocale_getLanguage U_ICU_ENTRY_POINT_RENAME(ulocale_getLanguage)
#define ulocale_getLocaleID U_ICU_ENTRY_POINT_RENAME(ulocale_getLocaleID)
#define ulocale_getRegion U_ICU_ENTRY_POINT_RENAME(ulocale_getRegion)
#define ulocale_getScript U_ICU_ENTRY_POINT_RENAME(ulocale_getScript)
#define ulocale_getUnicodeKeywordValue U_ICU_ENTRY_POINT_RENAME(ulocale_getUnicodeKeywordValue)
#define ulocale_getUnicodeKeywords U_ICU_ENTRY_POINT_RENAME(ulocale_getUnicodeKeywords)
#define ulocale_getVariant U_ICU_ENTRY_POINT_RENAME(ulocale_getVariant)
#define ulocale_isBogus U_ICU_ENTRY_POINT_RENAME(ulocale_isBogus)
#define ulocale_openForLanguageTag U_ICU_ENTRY_POINT_RENAME(ulocale_openForLanguageTag)
#define ulocale_openForLocaleID U_ICU_ENTRY_POINT_RENAME(ulocale_openForLocaleID)
#define ulocbld_addUnicodeLocaleAttribute U_ICU_ENTRY_POINT_RENAME(ulocbld_addUnicodeLocaleAttribute)
#define ulocbld_adoptULocale U_ICU_ENTRY_POINT_RENAME(ulocbld_adoptULocale)
#define ulocbld_buildLanguageTag U_ICU_ENTRY_POINT_RENAME(ulocbld_buildLanguageTag)
#define ulocbld_buildLocaleID U_ICU_ENTRY_POINT_RENAME(ulocbld_buildLocaleID)
#define ulocbld_buildULocale U_ICU_ENTRY_POINT_RENAME(ulocbld_buildULocale)
#define ulocbld_clear U_ICU_ENTRY_POINT_RENAME(ulocbld_clear)
#define ulocbld_clearExtensions U_ICU_ENTRY_POINT_RENAME(ulocbld_clearExtensions)
#define ulocbld_close U_ICU_ENTRY_POINT_RENAME(ulocbld_close)
#define ulocbld_copyErrorTo U_ICU_ENTRY_POINT_RENAME(ulocbld_copyErrorTo)
#define ulocbld_open U_ICU_ENTRY_POINT_RENAME(ulocbld_open)
#define ulocbld_removeUnicodeLocaleAttribute U_ICU_ENTRY_POINT_RENAME(ulocbld_removeUnicodeLocaleAttribute)
#define ulocbld_setExtension U_ICU_ENTRY_POINT_RENAME(ulocbld_setExtension)
#define ulocbld_setLanguage U_ICU_ENTRY_POINT_RENAME(ulocbld_setLanguage)
#define ulocbld_setLanguageTag U_ICU_ENTRY_POINT_RENAME(ulocbld_setLanguageTag)
#define ulocbld_setLocale U_ICU_ENTRY_POINT_RENAME(ulocbld_setLocale)
#define ulocbld_setRegion U_ICU_ENTRY_POINT_RENAME(ulocbld_setRegion)
#define ulocbld_setScript U_ICU_ENTRY_POINT_RENAME(ulocbld_setScript)
#define ulocbld_setUnicodeLocaleKeyword U_ICU_ENTRY_POINT_RENAME(ulocbld_setUnicodeLocaleKeyword)
#define ulocbld_setVariant U_ICU_ENTRY_POINT_RENAME(ulocbld_setVariant)
#define ulocdata_close U_ICU_ENTRY_POINT_RENAME(ulocdata_close)
#define ulocdata_getCLDRVersion U_ICU_ENTRY_POINT_RENAME(ulocdata_getCLDRVersion)
#define ulocdata_getDelimiter U_ICU_ENTRY_POINT_RENAME(ulocdata_getDelimiter)
@ -1213,6 +1257,7 @@
#define unorm2_getNFDInstance U_ICU_ENTRY_POINT_RENAME(unorm2_getNFDInstance)
#define unorm2_getNFKCCasefoldInstance U_ICU_ENTRY_POINT_RENAME(unorm2_getNFKCCasefoldInstance)
#define unorm2_getNFKCInstance U_ICU_ENTRY_POINT_RENAME(unorm2_getNFKCInstance)
#define unorm2_getNFKCSimpleCasefoldInstance U_ICU_ENTRY_POINT_RENAME(unorm2_getNFKCSimpleCasefoldInstance)
#define unorm2_getNFKDInstance U_ICU_ENTRY_POINT_RENAME(unorm2_getNFKDInstance)
#define unorm2_getRawDecomposition U_ICU_ENTRY_POINT_RENAME(unorm2_getRawDecomposition)
#define unorm2_hasBoundaryAfter U_ICU_ENTRY_POINT_RENAME(unorm2_hasBoundaryAfter)
@ -1349,6 +1394,7 @@
#define uprv_convertToPosix U_ICU_ENTRY_POINT_RENAME(uprv_convertToPosix)
#define uprv_copyAscii U_ICU_ENTRY_POINT_RENAME(uprv_copyAscii)
#define uprv_copyEbcdic U_ICU_ENTRY_POINT_RENAME(uprv_copyEbcdic)
#define uprv_currencyLeads U_ICU_ENTRY_POINT_RENAME(uprv_currencyLeads)
#define uprv_decContextClearStatus U_ICU_ENTRY_POINT_RENAME(uprv_decContextClearStatus)
#define uprv_decContextDefault U_ICU_ENTRY_POINT_RENAME(uprv_decContextDefault)
#define uprv_decContextGetRounding U_ICU_ENTRY_POINT_RENAME(uprv_decContextGetRounding)
@ -1367,6 +1413,7 @@
#define uprv_decNumberAbs U_ICU_ENTRY_POINT_RENAME(uprv_decNumberAbs)
#define uprv_decNumberAdd U_ICU_ENTRY_POINT_RENAME(uprv_decNumberAdd)
#define uprv_decNumberAnd U_ICU_ENTRY_POINT_RENAME(uprv_decNumberAnd)
#define uprv_decNumberClass U_ICU_ENTRY_POINT_RENAME(uprv_decNumberClass)
#define uprv_decNumberClassToString U_ICU_ENTRY_POINT_RENAME(uprv_decNumberClassToString)
#define uprv_decNumberCompare U_ICU_ENTRY_POINT_RENAME(uprv_decNumberCompare)
#define uprv_decNumberCompareSignal U_ICU_ENTRY_POINT_RENAME(uprv_decNumberCompareSignal)
@ -1763,6 +1810,9 @@
#define usnumf_formatInt64 U_ICU_ENTRY_POINT_RENAME(usnumf_formatInt64)
#define usnumf_openForLocale U_ICU_ENTRY_POINT_RENAME(usnumf_openForLocale)
#define usnumf_openForLocaleAndGroupingStrategy U_ICU_ENTRY_POINT_RENAME(usnumf_openForLocaleAndGroupingStrategy)
#define uspoof_areBidiConfusable U_ICU_ENTRY_POINT_RENAME(uspoof_areBidiConfusable)
#define uspoof_areBidiConfusableUTF8 U_ICU_ENTRY_POINT_RENAME(uspoof_areBidiConfusableUTF8)
#define uspoof_areBidiConfusableUnicodeString U_ICU_ENTRY_POINT_RENAME(uspoof_areBidiConfusableUnicodeString)
#define uspoof_areConfusable U_ICU_ENTRY_POINT_RENAME(uspoof_areConfusable)
#define uspoof_areConfusableUTF8 U_ICU_ENTRY_POINT_RENAME(uspoof_areConfusableUTF8)
#define uspoof_areConfusableUnicodeString U_ICU_ENTRY_POINT_RENAME(uspoof_areConfusableUnicodeString)
@ -1778,6 +1828,9 @@
#define uspoof_getAllowedChars U_ICU_ENTRY_POINT_RENAME(uspoof_getAllowedChars)
#define uspoof_getAllowedLocales U_ICU_ENTRY_POINT_RENAME(uspoof_getAllowedLocales)
#define uspoof_getAllowedUnicodeSet U_ICU_ENTRY_POINT_RENAME(uspoof_getAllowedUnicodeSet)
#define uspoof_getBidiSkeleton U_ICU_ENTRY_POINT_RENAME(uspoof_getBidiSkeleton)
#define uspoof_getBidiSkeletonUTF8 U_ICU_ENTRY_POINT_RENAME(uspoof_getBidiSkeletonUTF8)
#define uspoof_getBidiSkeletonUnicodeString U_ICU_ENTRY_POINT_RENAME(uspoof_getBidiSkeletonUnicodeString)
#define uspoof_getCheckResultChecks U_ICU_ENTRY_POINT_RENAME(uspoof_getCheckResultChecks)
#define uspoof_getCheckResultNumerics U_ICU_ENTRY_POINT_RENAME(uspoof_getCheckResultNumerics)
#define uspoof_getCheckResultRestrictionLevel U_ICU_ENTRY_POINT_RENAME(uspoof_getCheckResultRestrictionLevel)

View file

@ -53,13 +53,13 @@
* This value will change in the subsequent releases of ICU
* @stable ICU 2.4
*/
#define U_ICU_VERSION_MAJOR_NUM 73
#define U_ICU_VERSION_MAJOR_NUM 74
/** The current ICU minor version as an integer.
* This value will change in the subsequent releases of ICU
* @stable ICU 2.6
*/
#define U_ICU_VERSION_MINOR_NUM 2
#define U_ICU_VERSION_MINOR_NUM 1
/** The current ICU patchlevel version as an integer.
* This value will change in the subsequent releases of ICU
@ -79,7 +79,7 @@
* This value will change in the subsequent releases of ICU
* @stable ICU 2.6
*/
#define U_ICU_VERSION_SUFFIX _73
#define U_ICU_VERSION_SUFFIX _74
/**
* \def U_DEF2_ICU_ENTRY_POINT_RENAME
@ -132,7 +132,7 @@
* This value will change in the subsequent releases of ICU
* @stable ICU 2.4
*/
#define U_ICU_VERSION "73.2"
#define U_ICU_VERSION "74.1"
/**
* The current ICU library major version number as a string, for library name suffixes.
@ -145,13 +145,13 @@
*
* @stable ICU 2.6
*/
#define U_ICU_VERSION_SHORT "73"
#define U_ICU_VERSION_SHORT "74"
#ifndef U_HIDE_INTERNAL_API
/** Data version in ICU4C.
* @internal ICU 4.4 Internal Use Only
**/
#define U_ICU_DATA_VERSION "73.2"
#define U_ICU_DATA_VERSION "74.1"
#endif /* U_HIDE_INTERNAL_API */
/*===========================================================================

View file

@ -10,6 +10,7 @@
#include "charstr.h"
#include "uassert.h"
#include "uhash.h"
#include "cmemory.h"
U_NAMESPACE_BEGIN
@ -47,22 +48,20 @@ public:
}
/**
* Adds a string and returns a unique number for it.
* The string's buffer contents must not change, nor move around in memory,
* Adds a NUL-terminated string and returns a unique number for it.
* The string must not change, nor move around in memory,
* while this UniqueCharStrings is in use.
* The string contents must be NUL-terminated exactly at s.length().
*
* Best used with read-only-alias UnicodeString objects that point to
* stable storage, such as strings returned by resource bundle functions.
* Best used with string data in a stable storage, such as strings returned
* by resource bundle functions.
*/
int32_t add(const UnicodeString &s, UErrorCode &errorCode) {
if (U_FAILURE(errorCode)) { return 0; }
int32_t add(const char16_t*p, UErrorCode &errorCode) {
if (U_FAILURE(errorCode)) { return -1; }
if (isFrozen) {
errorCode = U_NO_WRITE_PERMISSION;
return 0;
return -1;
}
// The string points into the resource bundle.
const char16_t *p = s.getBuffer();
int32_t oldIndex = uhash_geti(&map, p);
if (oldIndex != 0) { // found duplicate
return oldIndex;
@ -71,11 +70,33 @@ public:
// The strings object is also terminated with one implicit NUL.
strings->append(0, errorCode);
int32_t newIndex = strings->length();
strings->appendInvariantChars(s, errorCode);
strings->appendInvariantChars(p, u_strlen(p), errorCode);
uhash_puti(&map, const_cast<char16_t *>(p), newIndex, &errorCode);
return newIndex;
}
/**
* Adds a unicode string by value and returns a unique number for it.
*/
int32_t addByValue(UnicodeString s, UErrorCode &errorCode) {
if (U_FAILURE(errorCode)) { return -1; }
if (isFrozen) {
errorCode = U_NO_WRITE_PERMISSION;
return -1;
}
int32_t oldIndex = uhash_geti(&map, s.getTerminatedBuffer());
if (oldIndex != 0) { // found duplicate
return oldIndex;
}
// We need to store the string content of the UnicodeString.
UnicodeString *key = keyStore.create(s);
if (key == nullptr) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return -1;
}
return add(key->getTerminatedBuffer(), errorCode);
}
void freeze() { isFrozen = true; }
/**
@ -90,6 +111,7 @@ public:
private:
UHashtable map;
CharString *strings;
MemoryPool<UnicodeString> keyStore;
bool isFrozen = false;
};

View file

@ -328,6 +328,53 @@ static UBool hasEmojiProperty(const BinaryProperty &/*prop*/, UChar32 c, UProper
return EmojiProps::hasBinaryProperty(c, which);
}
static UBool isIDSUnaryOperator(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
// New in Unicode 15.1 for just two characters.
return 0x2FFE<=c && c<=0x2FFF;
}
/** Ranges (start/limit pairs) of ID_Compat_Math_Continue (only), from UCD PropList.txt. */
static constexpr UChar32 ID_COMPAT_MATH_CONTINUE[] = {
0x00B2, 0x00B3 + 1,
0x00B9, 0x00B9 + 1,
0x2070, 0x2070 + 1,
0x2074, 0x207E + 1,
0x2080, 0x208E + 1
};
/** ID_Compat_Math_Start characters, from UCD PropList.txt. */
static constexpr UChar32 ID_COMPAT_MATH_START[] = {
0x2202,
0x2207,
0x221E,
0x1D6C1,
0x1D6DB,
0x1D6FB,
0x1D715,
0x1D735,
0x1D74F,
0x1D76F,
0x1D789,
0x1D7A9,
0x1D7C3
};
static UBool isIDCompatMathStart(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
if (c < ID_COMPAT_MATH_START[0]) { return false; } // fastpath for common scripts
for (UChar32 startChar : ID_COMPAT_MATH_START) {
if (c == startChar) { return true; }
}
return false;
}
static UBool isIDCompatMathContinue(const BinaryProperty &prop, UChar32 c, UProperty /*which*/) {
for (int32_t i = 0; i < UPRV_LENGTHOF(ID_COMPAT_MATH_CONTINUE); i += 2) {
if (c < ID_COMPAT_MATH_CONTINUE[i]) { return false; } // below range start
if (c < ID_COMPAT_MATH_CONTINUE[i + 1]) { return true; } // below range limit
}
return isIDCompatMathStart(prop, c, UCHAR_ID_COMPAT_MATH_START);
}
static const BinaryProperty binProps[UCHAR_BINARY_LIMIT]={
/*
* column and mask values for binary properties from u_getUnicodeProperties().
@ -409,6 +456,9 @@ static const BinaryProperty binProps[UCHAR_BINARY_LIMIT]={
{ UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_RGI_EMOJI_TAG_SEQUENCE
{ UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_RGI_EMOJI_ZWJ_SEQUENCE
{ UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_RGI_EMOJI
{ UPROPS_SRC_IDSU, 0, isIDSUnaryOperator }, // UCHAR_IDS_UNARY_OPERATOR
{ UPROPS_SRC_ID_COMPAT_MATH, 0, isIDCompatMathStart }, // UCHAR_ID_COMPAT_MATH_START
{ UPROPS_SRC_ID_COMPAT_MATH, 0, isIDCompatMathContinue }, // UCHAR_ID_COMPAT_MATH_CONTINUE
};
U_CAPI UBool U_EXPORT2
@ -759,6 +809,19 @@ uprops_getSource(UProperty which) {
U_CFUNC void U_EXPORT2
uprops_addPropertyStarts(UPropertySource src, const USetAdder *sa, UErrorCode *pErrorCode) {
if (U_FAILURE(*pErrorCode)) { return; }
if (src == UPROPS_SRC_ID_COMPAT_MATH) {
// range limits
for (UChar32 c : ID_COMPAT_MATH_CONTINUE) {
sa->add(sa->set, c);
}
// single characters
for (UChar32 c : ID_COMPAT_MATH_START) {
sa->add(sa->set, c);
sa->add(sa->set, c + 1);
}
return;
}
if (!ulayout_ensureData(*pErrorCode)) { return; }
const UCPTrie *trie;
switch (src) {

View file

@ -379,6 +379,8 @@ enum UPropertySource {
UPROPS_SRC_INSC,
UPROPS_SRC_VO,
UPROPS_SRC_EMOJI,
UPROPS_SRC_IDSU,
UPROPS_SRC_ID_COMPAT_MATH,
/** One more than the highest UPropertySource (UPROPS_SRC_) constant. */
UPROPS_SRC_COUNT
};

View file

@ -24,6 +24,7 @@
#include "unicode/ures.h"
#include "unicode/ustring.h"
#include "unicode/ucnv.h"
#include "bytesinkutil.h"
#include "charstr.h"
#include "uresimp.h"
#include "ustr_imp.h"
@ -2351,7 +2352,66 @@ struct GetAllChildrenSink : public ResourceSink {
aliasedValue.setData(aliasRB->getResData());
aliasedValue.setValidLocaleDataEntry(aliasRB->fValidLocaleDataEntry);
aliasedValue.setResource(aliasRB->fRes, ResourceTracer(aliasRB));
dest.put(key, aliasedValue, isRoot, errorCode);
if (aliasedValue.getType() != URES_TABLE) {
dest.put(key, aliasedValue, isRoot, errorCode);
} else {
// if the resource we're aliasing over to is a table, the sink might iterate over its contents.
// If it does, it'll get only the things defined in the actual alias target, not the things
// the target inherits from its parent resources. So we walk the parent chain for the *alias target*,
// calling dest.put() for each of the parent tables we could be inheriting from. This means
// that dest.put() has to iterate over the children of multiple tables to get all of the inherited
// resource values, but it already has to do that to handle normal vertical inheritance.
UResType aliasedValueType = URES_TABLE;
CharString tablePath;
tablePath.append(aliasRB->fResPath, errorCode);
const char* parentKey = key; // dest.put() changes the key
dest.put(parentKey, aliasedValue, isRoot, errorCode);
UResourceDataEntry* entry = aliasRB->fData;
Resource res = aliasRB->fRes;
while (aliasedValueType == URES_TABLE && entry->fParent != nullptr) {
CharString localPath;
localPath.copyFrom(tablePath, errorCode);
char* localPathAsCharPtr = localPath.data();
const char* childKey;
entry = entry->fParent;
res = entry->fData.rootRes;
Resource newRes = res_findResource(&entry->fData, res, &localPathAsCharPtr, &childKey);
if (newRes != RES_BOGUS) {
aliasedValue.setData(entry->fData);
// TODO: do I also need to call aliasedValue.setValueLocaleDataEntry() ?
aliasedValue.setResource(newRes, ResourceTracer(aliasRB)); // probably wrong to use aliasRB here
aliasedValueType = aliasedValue.getType();
if (aliasedValueType == URES_ALIAS) {
// in a few rare cases, when we get to the root resource bundle, the resource in question
// won't be an actual table, but will instead be an alias to a table. That is, we have
// two aliases in the inheritance path. (For some locales, such as Zulu, we see this with
// children of the "fields" resource: "day-narrow" aliases to "day-short", which aliases
// to "day".) When this happens, we need to make sure we follow all the aliases.
ResourceDataValue& rdv2 = static_cast<ResourceDataValue&>(aliasedValue);
aliasRB = getAliasTargetAsResourceBundle(rdv2.getData(), rdv2.getResource(), nullptr, -1,
rdv2.getValidLocaleDataEntry(), nullptr, 0,
stackTempBundle.getAlias(), &errorCode);
tablePath.clear();
tablePath.append(aliasRB->fResPath, errorCode);
entry = aliasRB->fData;
res = aliasRB->fRes;
aliasedValue.setData(entry->fData);
// TODO: do I also need to call aliasedValue.setValueLocaleDataEntry() ?
aliasedValue.setResource(res, ResourceTracer(aliasRB)); // probably wrong to use aliasRB here
aliasedValueType = aliasedValue.getType();
}
if (aliasedValueType == URES_TABLE) {
dest.put(parentKey, aliasedValue, isRoot, errorCode);
} else {
// once we've followed the alias, the resource we're looking at really should
// be a table
errorCode = U_INTERNAL_PROGRAM_ERROR;
return;
}
}
}
}
}
} else {
dest.put(key, value, isRoot, errorCode);
@ -2657,13 +2717,16 @@ ures_openWithType(UResourceBundle *r, const char* path, const char* localeID,
UResourceDataEntry *entry;
if(openType != URES_OPEN_DIRECT) {
/* first "canonicalize" the locale ID */
char canonLocaleID[ULOC_FULLNAME_CAPACITY];
uloc_getBaseName(localeID, canonLocaleID, UPRV_LENGTHOF(canonLocaleID), status);
if(U_FAILURE(*status) || *status == U_STRING_NOT_TERMINATED_WARNING) {
CharString canonLocaleID;
{
CharStringByteSink sink(&canonLocaleID);
ulocimp_getBaseName(localeID, sink, status);
}
if(U_FAILURE(*status)) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return nullptr;
}
entry = entryOpen(path, canonLocaleID, openType, status);
entry = entryOpen(path, canonLocaleID.data(), openType, status);
} else {
entry = entryOpenDirect(path, localeID, status);
}
@ -2974,15 +3037,39 @@ static UBool isLocaleInList(UEnumeration *locEnum, const char *locToSearch, UErr
return false;
}
static void getParentForFunctionalEquivalent(const char* localeID,
UResourceBundle* res,
UResourceBundle* bund1,
char* parent,
int32_t parentCapacity) {
// Get parent.
// First check for a parent from %%Parent resource (Note that in resource trees
// such as collation, data may have different parents than in parentLocales).
UErrorCode subStatus = U_ZERO_ERROR;
parent[0] = '\0';
if (res != NULL) {
ures_getByKey(res, "%%Parent", bund1, &subStatus);
if (U_SUCCESS(subStatus)) {
int32_t parentLen = parentCapacity;
ures_getUTF8String(bund1, parent, &parentLen, true, &subStatus);
}
}
// If none there, use normal truncation parent
if (U_FAILURE(subStatus) || parent[0] == 0) {
subStatus = U_ZERO_ERROR;
uloc_getParent(localeID, parent, parentCapacity, &subStatus);
}
}
U_CAPI int32_t U_EXPORT2
ures_getFunctionalEquivalent(char *result, int32_t resultCapacity,
const char *path, const char *resName, const char *keyword, const char *locid,
UBool *isAvailable, UBool omitDefault, UErrorCode *status)
{
char kwVal[1024] = ""; /* value of keyword 'keyword' */
char defVal[1024] = ""; /* default value for given locale */
char defLoc[1024] = ""; /* default value for given locale */
char base[1024] = ""; /* base locale */
CharString base; /* base locale */
char found[1024] = "";
char parent[1024] = "";
char full[1024] = "";
@ -2991,23 +3078,29 @@ ures_getFunctionalEquivalent(char *result, int32_t resultCapacity,
UErrorCode subStatus = U_ZERO_ERROR;
int32_t length = 0;
if(U_FAILURE(*status)) return 0;
uloc_getKeywordValue(locid, keyword, kwVal, 1024-1,&subStatus);
if(!uprv_strcmp(kwVal, DEFAULT_TAG)) {
kwVal[0]=0;
CharString kwVal;
{
CharStringByteSink sink(&kwVal);
ulocimp_getKeywordValue(locid, keyword, sink, &subStatus);
}
if(kwVal == DEFAULT_TAG) {
kwVal.clear();
}
{
CharStringByteSink sink(&base);
ulocimp_getBaseName(locid, sink, &subStatus);
}
uloc_getBaseName(locid, base, 1024-1,&subStatus);
#if defined(URES_TREE_DEBUG)
fprintf(stderr, "getFunctionalEquivalent: \"%s\" [%s=%s] in %s - %s\n",
locid, keyword, kwVal, base, u_errorName(subStatus));
locid, keyword, kwVal.data(), base.data(), u_errorName(subStatus));
#endif
ures_initStackObject(&bund1);
ures_initStackObject(&bund2);
uprv_strcpy(parent, base);
uprv_strcpy(found, base);
if(isAvailable) {
base.extract(parent, UPRV_LENGTHOF(parent), subStatus);
base.extract(found, UPRV_LENGTHOF(found), subStatus);
if(isAvailable) {
UEnumeration *locEnum = ures_openAvailableLocales(path, &subStatus);
*isAvailable = true;
if (U_SUCCESS(subStatus)) {
@ -3054,11 +3147,11 @@ ures_getFunctionalEquivalent(char *result, int32_t resultCapacity,
path?path:"ICUDATA", parent, keyword, defVal, u_errorName(subStatus));
#endif
uprv_strcpy(defLoc, parent);
if(kwVal[0]==0) {
uprv_strcpy(kwVal, defVal);
if(kwVal.isEmpty()) {
kwVal.append(defVal, defLen, subStatus);
#if defined(URES_TREE_DEBUG)
fprintf(stderr, "%s;%s -> kwVal = %s\n",
path?path:"ICUDATA", parent, keyword, kwVal);
path?path:"ICUDATA", parent, keyword, kwVal.data());
#endif
}
}
@ -3071,16 +3164,19 @@ ures_getFunctionalEquivalent(char *result, int32_t resultCapacity,
uprv_strcpy(found, ures_getLocaleByType(res, ULOC_VALID_LOCALE, &subStatus));
}
uloc_getParent(found,parent,sizeof(parent),&subStatus);
if (uprv_strcmp(found, parent) != 0) {
uprv_strcpy(parent, found);
} else {
getParentForFunctionalEquivalent(found,res,&bund1,parent,sizeof(parent));
}
ures_close(res);
} while(!defVal[0] && *found && uprv_strcmp(found, "root") != 0 && U_SUCCESS(*status));
/* Now, see if we can find the kwVal collator.. start the search over.. */
uprv_strcpy(parent, base);
uprv_strcpy(found, base);
base.extract(parent, UPRV_LENGTHOF(parent), subStatus);
base.extract(found, UPRV_LENGTHOF(found), subStatus);
do {
subStatus = U_ZERO_ERROR;
res = ures_open(path, parent, &subStatus);
if((subStatus == U_USING_FALLBACK_WARNING) && isAvailable) {
*isAvailable = false;
@ -3089,7 +3185,7 @@ ures_getFunctionalEquivalent(char *result, int32_t resultCapacity,
#if defined(URES_TREE_DEBUG)
fprintf(stderr, "%s;%s -> %s (looking for %s)\n",
path?path:"ICUDATA", parent, u_errorName(subStatus), kwVal);
path?path:"ICUDATA", parent, u_errorName(subStatus), kwVal.data());
#endif
if(U_FAILURE(subStatus)) {
*status = subStatus;
@ -3099,14 +3195,14 @@ ures_getFunctionalEquivalent(char *result, int32_t resultCapacity,
/**/ fprintf(stderr,"@%d [%s] %s\n", __LINE__, resName, u_errorName(subStatus));
#endif
if(subStatus == U_ZERO_ERROR) {
ures_getByKey(&bund1, kwVal, &bund2, &subStatus);
ures_getByKey(&bund1, kwVal.data(), &bund2, &subStatus);
#if defined(URES_TREE_DEBUG)
/**/ fprintf(stderr,"@%d [%s] %s\n", __LINE__, kwVal, u_errorName(subStatus));
/**/ fprintf(stderr,"@%d [%s] %s\n", __LINE__, kwVal.data(), u_errorName(subStatus));
#endif
if(subStatus == U_ZERO_ERROR) {
#if defined(URES_TREE_DEBUG)
fprintf(stderr, "%s;%s -> full0 %s=%s, %s\n",
path?path:"ICUDATA", parent, keyword, kwVal, u_errorName(subStatus));
path?path:"ICUDATA", parent, keyword, kwVal.data(), u_errorName(subStatus));
#endif
uprv_strcpy(full, parent);
if(*full == 0) {
@ -3139,29 +3235,52 @@ ures_getFunctionalEquivalent(char *result, int32_t resultCapacity,
} else {
#if defined(URES_TREE_DEBUG)
fprintf(stderr, "err=%s in %s looking for %s\n",
u_errorName(subStatus), parent, kwVal);
u_errorName(subStatus), parent, kwVal.data());
#endif
}
}
}
subStatus = U_ZERO_ERROR;
uprv_strcpy(found, parent);
uloc_getParent(found,parent,1023,&subStatus);
ures_close(res);
} while(!full[0] && *found && U_SUCCESS(*status));
if((full[0]==0) && uprv_strcmp(kwVal, defVal)) {
#if defined(URES_TREE_DEBUG)
fprintf(stderr, "Failed to locate kw %s - try default %s\n", kwVal, defVal);
#endif
uprv_strcpy(kwVal, defVal);
uprv_strcpy(parent, base);
uprv_strcpy(found, base);
do { /* search for 'default' named item */
UBool haveFound = false;
// At least for collations which may be aliased, we need to use the VALID locale
// as the parent instead of just truncating, as long as the VALID locale is not
// root and has a different language than the parent. Use of the VALID locale
// here is similar to the procedure used at the end of the previous do-while loop
// for all resource types.
if (res != NULL && uprv_strcmp(resName, "collations") == 0) {
subStatus = U_ZERO_ERROR;
const char *validLoc = ures_getLocaleByType(res, ULOC_VALID_LOCALE, &subStatus);
if (U_SUCCESS(subStatus) && validLoc != NULL && validLoc[0] != 0 && uprv_strcmp(validLoc, "root") != 0) {
char validLang[ULOC_LANG_CAPACITY];
char parentLang[ULOC_LANG_CAPACITY];
uloc_getLanguage(validLoc, validLang, ULOC_LANG_CAPACITY, &subStatus);
uloc_getLanguage(parent, parentLang, ULOC_LANG_CAPACITY, &subStatus);
if (U_SUCCESS(subStatus) && uprv_strcmp(validLang, parentLang) != 0) {
// validLoc is not root and has a different language than parent, use it instead
uprv_strcpy(found, validLoc);
haveFound = true;
}
}
subStatus = U_ZERO_ERROR;
}
if (!haveFound) {
uprv_strcpy(found, parent);
}
getParentForFunctionalEquivalent(found,res,&bund1,parent,1023);
ures_close(res);
subStatus = U_ZERO_ERROR;
} while(!full[0] && *found && U_SUCCESS(*status));
if((full[0]==0) && kwVal != defVal) {
#if defined(URES_TREE_DEBUG)
fprintf(stderr, "Failed to locate kw %s - try default %s\n", kwVal.data(), defVal);
#endif
kwVal.clear().append(defVal, subStatus);
base.extract(parent, UPRV_LENGTHOF(parent), subStatus);
base.extract(found, UPRV_LENGTHOF(found), subStatus);
do { /* search for 'default' named item */
res = ures_open(path, parent, &subStatus);
if((subStatus == U_USING_FALLBACK_WARNING) && isAvailable) {
*isAvailable = false;
@ -3170,18 +3289,18 @@ ures_getFunctionalEquivalent(char *result, int32_t resultCapacity,
#if defined(URES_TREE_DEBUG)
fprintf(stderr, "%s;%s -> %s (looking for default %s)\n",
path?path:"ICUDATA", parent, u_errorName(subStatus), kwVal);
path?path:"ICUDATA", parent, u_errorName(subStatus), kwVal.data());
#endif
if(U_FAILURE(subStatus)) {
*status = subStatus;
} else if(subStatus == U_ZERO_ERROR) {
ures_getByKey(res,resName,&bund1, &subStatus);
if(subStatus == U_ZERO_ERROR) {
ures_getByKey(&bund1, kwVal, &bund2, &subStatus);
ures_getByKey(&bund1, kwVal.data(), &bund2, &subStatus);
if(subStatus == U_ZERO_ERROR) {
#if defined(URES_TREE_DEBUG)
fprintf(stderr, "%s;%s -> full1 %s=%s, %s\n", path?path:"ICUDATA",
parent, keyword, kwVal, u_errorName(subStatus));
parent, keyword, kwVal.data(), u_errorName(subStatus));
#endif
uprv_strcpy(full, parent);
if(*full == 0) {
@ -3215,18 +3334,18 @@ ures_getFunctionalEquivalent(char *result, int32_t resultCapacity,
}
}
}
subStatus = U_ZERO_ERROR;
uprv_strcpy(found, parent);
uloc_getParent(found,parent,1023,&subStatus);
getParentForFunctionalEquivalent(found,res,&bund1,parent,1023);
ures_close(res);
subStatus = U_ZERO_ERROR;
} while(!full[0] && *found && U_SUCCESS(*status));
}
if(U_SUCCESS(*status)) {
if(!full[0]) {
#if defined(URES_TREE_DEBUG)
fprintf(stderr, "Still could not load keyword %s=%s\n", keyword, kwVal);
fprintf(stderr, "Still could not load keyword %s=%s\n", keyword, kwVal.data());
#endif
*status = U_MISSING_RESOURCE_ERROR;
} else if(omitDefault) {
@ -3235,21 +3354,21 @@ ures_getFunctionalEquivalent(char *result, int32_t resultCapacity,
#endif
if(uprv_strlen(defLoc) <= uprv_strlen(full)) {
/* found the keyword in a *child* of where the default tag was present. */
if(!uprv_strcmp(kwVal, defVal)) { /* if the requested kw is default, */
if(kwVal == defVal) { /* if the requested kw is default, */
/* and the default is in or in an ancestor of the current locale */
#if defined(URES_TREE_DEBUG)
fprintf(stderr, "Removing unneeded var %s=%s\n", keyword, kwVal);
fprintf(stderr, "Removing unneeded var %s=%s\n", keyword, kwVal.data());
#endif
kwVal[0]=0;
kwVal.clear();
}
}
}
uprv_strcpy(found, full);
if(kwVal[0]) {
if(!kwVal.isEmpty()) {
uprv_strcat(found, "@");
uprv_strcat(found, keyword);
uprv_strcat(found, "=");
uprv_strcat(found, kwVal);
uprv_strcat(found, kwVal.data());
} else if(!omitDefault) {
uprv_strcat(found, "@");
uprv_strcat(found, keyword);

View file

@ -1130,14 +1130,18 @@ int32_t toUpper(uint32_t options,
// Adding one only to the final vowel in a longer sequence
// (which does not occur in normal writing) would require lookahead.
// Set the same flag as for preserving an existing dialytika.
if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
(upper == 0x399 || upper == 0x3A5)) {
data |= HAS_DIALYTIKA;
if ((data & HAS_VOWEL) != 0 &&
(state & (AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT | AFTER_VOWEL_WITH_COMBINING_ACCENT)) !=
0 &&
(upper == 0x399 || upper == 0x3A5)) {
data |= (state & AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT) ? HAS_DIALYTIKA
: HAS_COMBINING_DIALYTIKA;
}
int32_t numYpogegrammeni = 0; // Map each one to a trailing, spacing, capital iota.
if ((data & HAS_YPOGEGRAMMENI) != 0) {
numYpogegrammeni = 1;
}
const UBool hasPrecomposedAccent = (data & HAS_ACCENT) != 0;
// Skip combining diacritics after this Greek letter.
while (nextIndex < srcLength) {
uint32_t diacriticData = getDiacriticData(src[nextIndex]);
@ -1152,7 +1156,8 @@ int32_t toUpper(uint32_t options,
}
}
if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
nextState |= AFTER_VOWEL_WITH_ACCENT;
nextState |= hasPrecomposedAccent ? AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT
: AFTER_VOWEL_WITH_COMBINING_ACCENT;
}
// Map according to Greek rules.
UBool addTonos = false;
@ -1163,7 +1168,7 @@ int32_t toUpper(uint32_t options,
!isFollowedByCasedLetter(src, nextIndex, srcLength)) {
// Keep disjunctive "or" with (only) a tonos.
// We use the same "word boundary" conditions as for the Final_Sigma test.
if (i == nextIndex) {
if (hasPrecomposedAccent) {
upper = 0x389; // Preserve the precomposed form.
} else {
addTonos = true;

View file

@ -669,14 +669,6 @@ UTS46::mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart
return length;
}
// Some non-ASCII characters are equivalent to sequences with
// non-LDH ASCII characters. To find them:
// grep disallowed_STD3_valid IdnaMappingTable.txt (or uts46.txt)
static inline UBool
isNonASCIIDisallowedSTD3Valid(UChar32 c) {
return c==0x2260 || c==0x226E || c==0x226F;
}
// Replace the label in dest with the label string, if the label was modified.
// If &label==&dest then the label was modified in-place and labelLength
// is the new label length, different from label.length().
@ -820,10 +812,7 @@ UTS46::processLabel(UnicodeString &dest,
}
} else {
oredChars|=c;
if(disallowNonLDHDot && isNonASCIIDisallowedSTD3Valid(c)) {
info.labelErrors|=UIDNA_ERROR_DISALLOWED;
*s=0xfffd;
} else if(c==0xfffd) {
if(c==0xfffd) {
info.labelErrors|=UIDNA_ERROR_DISALLOWED;
}
}

View file

@ -19,6 +19,7 @@
#ifndef USPOOF_H
#define USPOOF_H
#include "unicode/ubidi.h"
#include "unicode/utypes.h"
#include "unicode/uset.h"
#include "unicode/parseerr.h"
@ -83,6 +84,25 @@
* the instance should be created once (e.g., upon application startup), and the efficient
* {@link uspoof_areConfusable} method can be used at runtime.
*
* If the paragraph direction used to display the strings is known, the bidi function should be used instead:
*
* \code{.c}
* UErrorCode status = U_ZERO_ERROR;
* // These strings look identical when rendered in a left-to-right context.
* // They look distinct in a right-to-left context.
* UChar* str1 = (UChar*) u"A1\u05D0"; // A1א
* UChar* str2 = (UChar*) u"A\u05D01"; // Aא1
*
* USpoofChecker* sc = uspoof_open(&status);
* uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
*
* int32_t bitmask = uspoof_areBidiConfusable(sc, UBIDI_LTR, str1, -1, str2, -1, &status);
* UBool result = bitmask != 0;
* // areBidiConfusable: 1 (status: U_ZERO_ERROR)
* printf("areBidiConfusable: %d (status: %s)\n", result, u_errorName(status));
* uspoof_close(sc);
* \endcode
*
* <p>
* The type {@link LocalUSpoofCheckerPointer} is exposed for C++ programmers. It will automatically call
* {@link uspoof_close} when the object goes out of scope:
@ -339,6 +359,51 @@
* COMMON or INHERITED, such as numbers and punctuation, are ignored when computing whether a string has multiple
* scripts.
*
* <h2>Advanced bidirectional usage</h2>
* If the paragraph direction with which the identifiers will be displayed is not known, there are
* multiple options for confusable detection depending on the circumstances.
*
* <p>
* In some circumstances, the only concern is confusion between identifiers displayed with the same
* paragraph direction.
*
* <p>
* An example is the case where identifiers are usernames prefixed with the @ symbol.
* That symbol will appear to the left in a left-to-right context, and to the right in a
* right-to-left context, so that an identifier displayed in a left-to-right context can never be
* confused with an identifier displayed in a right-to-left context:
* <ul>
* <li>
* The usernames "A1א" (A one aleph) and "Aא1" (A aleph 1)
* would be considered confusable, since they both appear as \@A1א in a left-to-right context, and the
* usernames "אA_1" (aleph A underscore one) and "א1_A" (aleph one underscore A) would be considered
* confusable, since they both appear as A_1א@ in a right-to-left context.
* </li>
* <li>
* The username "Mark_" would not be considered confusable with the username "_Mark",
* even though the latter would appear as Mark_@ in a right-to-left context, and the
* former as \@Mark_ in a left-to-right context.
* </li>
* </ul>
* <p>
* In that case, the caller should check for both LTR-confusability and RTL-confusability:
*
* \code{.cpp}
* bool confusableInEitherDirection =
* uspoof_areBidiConfusableUnicodeString(sc, UBIDI_LTR, id1, id2, &status) ||
* uspoof_areBidiConfusableUnicodeString(sc, UBIDI_RTL, id1, id2, &status);
* \endcode
*
* If the bidiSkeleton is used, the LTR and RTL skeleta should be kept separately and compared, LTR
* with LTR and RTL with RTL.
*
* <p>
* In cases where confusability between the visual appearances of an identifier displayed in a
* left-to-right context with another identifier displayed in a right-to-left context is a concern,
* the LTR skeleton of one can be compared with the RTL skeleton of the other. However, this
* very broad definition of confusability may have unexpected results; for instance, it treats the
* ASCII identifiers "Mark_" and "_Mark" as confusable.
*
* <h2>Additional Information</h2>
*
* A <code>USpoofChecker</code> instance may be used repeatedly to perform checks on any number of identifiers.
@ -519,7 +584,7 @@ typedef enum USpoofChecks {
/**
* Constants from UAX #39 for use in {@link uspoof_setRestrictionLevel}, and
* Constants from UTS #39 for use in {@link uspoof_setRestrictionLevel}, and
* for returned identifier restriction levels in check results.
*
* @stable ICU 51
@ -633,8 +698,8 @@ uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLeng
/**
* Open a Spoof Checker from the source form of the spoof data.
* The input corresponds to the Unicode data file confusables.txt
* as described in Unicode UAX #39. The syntax of the source data
* is as described in UAX #39 for this file, and the content of
* as described in Unicode Technical Standard #39. The syntax of the source data
* is as described in UTS #39 for this file, and the content of
* this file is acceptable input.
*
* The character encoding of the (char *) input text is UTF-8.
@ -1111,7 +1176,7 @@ uspoof_getCheckResultNumerics(const USpoofCheckResult *checkResult, UErrorCode *
/**
* Check the whether two specified strings are visually confusable.
* Check whether two specified strings are visually confusable.
*
* If the strings are confusable, the return value will be nonzero, as long as
* {@link USPOOF_CONFUSABLE} was enabled in uspoof_setChecks().
@ -1159,7 +1224,58 @@ uspoof_areConfusable(const USpoofChecker *sc,
const UChar *id2, int32_t length2,
UErrorCode *status);
#ifndef U_HIDE_DRAFT_API
/**
* Check whether two specified strings are visually confusable when
* displayed in a context with the given paragraph direction.
*
* If the strings are confusable, the return value will be nonzero, as long as
* {@link USPOOF_CONFUSABLE} was enabled in uspoof_setChecks().
*
* The bits in the return value correspond to flags for each of the classes of
* confusables applicable to the two input strings. According to UTS 39
* section 4, the possible flags are:
*
* <ul>
* <li>{@link USPOOF_SINGLE_SCRIPT_CONFUSABLE}</li>
* <li>{@link USPOOF_MIXED_SCRIPT_CONFUSABLE}</li>
* <li>{@link USPOOF_WHOLE_SCRIPT_CONFUSABLE}</li>
* </ul>
*
* If one or more of the above flags were not listed in uspoof_setChecks(), this
* function will never report that class of confusable. The check
* {@link USPOOF_CONFUSABLE} enables all three flags.
*
*
* @param sc The USpoofChecker
* @param direction The paragraph direction with which the identifiers are
* displayed. Must be either UBIDI_LTR or UBIDI_RTL.
* @param id1 The first of the two identifiers to be compared for
* confusability. The strings are in UTF-16 format.
* @param length1 the length of the first identifier, expressed in
* 16 bit UTF-16 code units, or -1 if the string is
* nul terminated.
* @param id2 The second of the two identifiers to be compared for
* confusability. The identifiers are in UTF-16 format.
* @param length2 The length of the second identifiers, expressed in
* 16 bit UTF-16 code units, or -1 if the string is
* nul terminated.
* @param status The error code, set if an error occurred while attempting to
* perform the check.
* Confusability of the identifiers is not reported here,
* but through this function's return value.
* @return An integer value with bit(s) set corresponding to
* the type of confusability found, as defined by
* enum USpoofChecks. Zero is returned if the identifiers
* are not confusable.
*
* @draft ICU 74
*/
U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusable(const USpoofChecker *sc, UBiDiDirection direction,
const UChar *id1, int32_t length1,
const UChar *id2, int32_t length2,
UErrorCode *status);
#endif /* U_HIDE_DRAFT_API */
/**
* A version of {@link uspoof_areConfusable} accepting strings in UTF-8 format.
@ -1192,14 +1308,45 @@ uspoof_areConfusableUTF8(const USpoofChecker *sc,
const char *id2, int32_t length2,
UErrorCode *status);
#ifndef U_HIDE_DRAFT_API
/**
* A version of {@link uspoof_areBidiConfusable} accepting strings in UTF-8 format.
*
* @param sc The USpoofChecker
* @param direction The paragraph direction with which the identifiers are
* displayed. Must be either UBIDI_LTR or UBIDI_RTL.
* @param id1 The first of the two identifiers to be compared for
* confusability. The strings are in UTF-8 format.
* @param length1 the length of the first identifiers, in bytes, or -1
* if the string is nul terminated.
* @param id2 The second of the two identifiers to be compared for
* confusability. The strings are in UTF-8 format.
* @param length2 The length of the second string in bytes, or -1
* if the string is nul terminated.
* @param status The error code, set if an error occurred while attempting to
* perform the check.
* Confusability of the strings is not reported here,
* but through this function's return value.
* @return An integer value with bit(s) set corresponding to
* the type of confusability found, as defined by
* enum USpoofChecks. Zero is returned if the strings
* are not confusable.
*
* @draft ICU 74
*
* @see uspoof_areBidiConfusable
*/
U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusableUTF8(const USpoofChecker *sc, UBiDiDirection direction,
const char *id1, int32_t length1,
const char *id2, int32_t length2,
UErrorCode *status);
#endif /* U_HIDE_DRAFT_API */
/**
* Get the "skeleton" for an identifier.
* Skeletons are a transformation of the input identifier;
* Two identifiers are confusable if their skeletons are identical.
* See Unicode UAX #39 for additional information.
* See Unicode Technical Standard #39 for additional information.
*
* Using skeletons directly makes it possible to quickly check
* whether an identifier is confusable with any of some large
@ -1233,11 +1380,50 @@ uspoof_getSkeleton(const USpoofChecker *sc,
UChar *dest, int32_t destCapacity,
UErrorCode *status);
#ifndef U_HIDE_DRAFT_API
/**
* Get the "bidiSkeleton" for an identifier and a direction.
* Skeletons are a transformation of the input identifier;
* Two identifiers are LTR-confusable if their LTR bidiSkeletons are identical;
* they are RTL-confusable if their RTL bidiSkeletons are identical.
* See Unicode Technical Standard #39 for additional information:
* https://www.unicode.org/reports/tr39/#Confusable_Detection.
*
* Using skeletons directly makes it possible to quickly check
* whether an identifier is confusable with any of some large
* set of existing identifiers, by creating an efficiently
* searchable collection of the skeletons.
*
* @param sc The USpoofChecker.
* @param direction The context direction with which the identifier will be
* displayed. Must be either UBIDI_LTR or UBIDI_RTL.
* @param id The input identifier whose skeleton will be computed.
* @param length The length of the input identifier, expressed in 16 bit
* UTF-16 code units, or -1 if the string is zero terminated.
* @param dest The output buffer, to receive the skeleton string.
* @param destCapacity The length of the output buffer, in 16 bit units.
* The destCapacity may be zero, in which case the function will
* return the actual length of the skeleton.
* @param status The error code, set if an error occurred while attempting to
* perform the check.
* @return The length of the skeleton string. The returned length
* is always that of the complete skeleton, even when the
* supplied buffer is too small (or of zero length)
*
* @draft ICU 74
* @see uspoof_areBidiConfusable
*/
U_CAPI int32_t U_EXPORT2 uspoof_getBidiSkeleton(const USpoofChecker *sc,
UBiDiDirection direction,
const UChar *id, int32_t length,
UChar *dest, int32_t destCapacity, UErrorCode *status);
#endif /* U_HIDE_DRAFT_API */
/**
* Get the "skeleton" for an identifier.
* Skeletons are a transformation of the input identifier;
* Two identifiers are confusable if their skeletons are identical.
* See Unicode UAX #39 for additional information.
* See Unicode Technical Standard #39 for additional information.
*
* Using skeletons directly makes it possible to quickly check
* whether an identifier is confusable with any of some large
@ -1273,6 +1459,46 @@ uspoof_getSkeletonUTF8(const USpoofChecker *sc,
char *dest, int32_t destCapacity,
UErrorCode *status);
#ifndef U_HIDE_DRAFT_API
/**
* Get the "bidiSkeleton" for an identifier and a direction.
* Skeletons are a transformation of the input identifier;
* Two identifiers are LTR-confusable if their LTR bidiSkeletons are identical;
* they are RTL-confusable if their RTL bidiSkeletons are identical.
* See Unicode Technical Standard #39 for additional information:
* https://www.unicode.org/reports/tr39/#Confusable_Detection.
*
* Using skeletons directly makes it possible to quickly check
* whether an identifier is confusable with any of some large
* set of existing identifiers, by creating an efficiently
* searchable collection of the skeletons.
*
* @param sc The USpoofChecker
* @param direction The context direction with which the identifier will be
* displayed. Must be either UBIDI_LTR or UBIDI_RTL.
* @param id The UTF-8 format identifier whose skeleton will be computed.
* @param length The length of the input string, in bytes,
* or -1 if the string is zero terminated.
* @param dest The output buffer, to receive the skeleton string.
* @param destCapacity The length of the output buffer, in bytes.
* The destCapacity may be zero, in which case the function will
* return the actual length of the skeleton.
* @param status The error code, set if an error occurred while attempting to
* perform the check. Possible Errors include U_INVALID_CHAR_FOUND
* for invalid UTF-8 sequences, and
* U_BUFFER_OVERFLOW_ERROR if the destination buffer is too small
* to hold the complete skeleton.
* @return The length of the skeleton string, in bytes. The returned length
* is always that of the complete skeleton, even when the
* supplied buffer is too small (or of zero length)
*
* @draft ICU 74
*/
U_CAPI int32_t U_EXPORT2 uspoof_getBidiSkeletonUTF8(const USpoofChecker *sc, UBiDiDirection direction,
const char *id, int32_t length, char *dest,
int32_t destCapacity, UErrorCode *status);
#endif /* U_HIDE_DRAFT_API */
/**
* Get the set of Candidate Characters for Inclusion in Identifiers, as defined
* in http://unicode.org/Public/security/latest/xidmodifications.txt
@ -1510,11 +1736,42 @@ uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
const icu::UnicodeString &s2,
UErrorCode *status);
#ifndef U_HIDE_DRAFT_API
/**
* A version of {@link uspoof_areBidiConfusable} accepting UnicodeStrings.
*
* @param sc The USpoofChecker
* @param direction The paragraph direction with which the identifiers are
* displayed. Must be either UBIDI_LTR or UBIDI_RTL.
* @param s1 The first of the two identifiers to be compared for
* confusability. The strings are in UTF-8 format.
* @param s2 The second of the two identifiers to be compared for
* confusability. The strings are in UTF-8 format.
* @param status The error code, set if an error occurred while attempting to
* perform the check.
* Confusability of the identifiers is not reported here,
* but through this function's return value.
* @return An integer value with bit(s) set corresponding to
* the type of confusability found, as defined by
* enum USpoofChecks. Zero is returned if the identifiers
* are not confusable.
*
* @draft ICU 74
*
* @see uspoof_areBidiConfusable
*/
U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusableUnicodeString(const USpoofChecker *sc,
UBiDiDirection direction,
const icu::UnicodeString &s1,
const icu::UnicodeString &s2,
UErrorCode *status);
#endif /* U_HIDE_DRAFT_API */
/**
* Get the "skeleton" for an identifier.
* Skeletons are a transformation of the input identifier;
* Two identifiers are confusable if their skeletons are identical.
* See Unicode UAX #39 for additional information.
* See Unicode Technical Standard #39 for additional information.
*
* Using skeletons directly makes it possible to quickly check
* whether an identifier is confusable with any of some large
@ -1540,6 +1797,36 @@ uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
icu::UnicodeString &dest,
UErrorCode *status);
#ifndef U_HIDE_DRAFT_API
/**
* Get the "bidiSkeleton" for an identifier and a direction.
* Skeletons are a transformation of the input identifier;
* Two identifiers are LTR-confusable if their LTR bidiSkeletons are identical;
* they are RTL-confusable if their RTL bidiSkeletons are identical.
* See Unicode Technical Standard #39 for additional information.
* https://www.unicode.org/reports/tr39/#Confusable_Detection.
*
* Using skeletons directly makes it possible to quickly check
* whether an identifier is confusable with any of some large
* set of existing identifiers, by creating an efficiently
* searchable collection of the skeletons.
*
* @param sc The USpoofChecker.
* @param direction The context direction with which the identifier will be
* displayed. Must be either UBIDI_LTR or UBIDI_RTL.
* @param id The input identifier whose bidiSkeleton will be computed.
* @param dest The output identifier, to receive the skeleton string.
* @param status The error code, set if an error occurred while attempting to
* perform the check.
* @return A reference to the destination (skeleton) string.
*
* @draft ICU 74
*/
U_I18N_API icu::UnicodeString &U_EXPORT2 uspoof_getBidiSkeletonUnicodeString(
const USpoofChecker *sc, UBiDiDirection direction, const icu::UnicodeString &id,
icu::UnicodeString &dest, UErrorCode *status);
#endif /* U_HIDE_DRAFT_API */
/**
* Get the set of Candidate Characters for Inclusion in Identifiers, as defined
* in http://unicode.org/Public/security/latest/xidmodifications.txt

View file

@ -15,6 +15,7 @@
*
* Unicode Spoof Detection
*/
#include "unicode/ubidi.h"
#include "unicode/utypes.h"
#include "unicode/normalizer2.h"
#include "unicode/uspoof.h"
@ -141,8 +142,8 @@ void U_CALLCONV initializeStatics(UErrorCode &status) {
u"\\U0001DF00-\\U0001DF1E\\U0001DF25-\\U0001DF2A\\U0001E08F\\U0001E7E0-"
u"\\U0001E7E6\\U0001E7E8-\\U0001E7EB\\U0001E7ED\\U0001E7EE\\U0001E7F0-"
u"\\U0001E7FE\\U00020000-\\U0002A6DF\\U0002A700-\\U0002B739\\U0002B740-"
u"\\U0002B81D\\U0002B820-\\U0002CEA1\\U0002CEB0-\\U0002EBE0\\U00030000-"
u"\\U0003134A\\U00031350-\\U000323AF]";
u"\\U0002B81D\\U0002B820-\\U0002CEA1\\U0002CEB0-\\U0002EBE0\\U0002EBF0-"
u"\\U0002EE5D\\U00030000-\\U0003134A\\U00031350-\\U000323AF]";
gRecommendedSet = new UnicodeSet(UnicodeString(recommendedPat), status);
if (gRecommendedSet == nullptr) {
@ -538,6 +539,90 @@ uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
return result;
}
U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusable(const USpoofChecker *sc, UBiDiDirection direction,
const char16_t *id1, int32_t length1,
const char16_t *id2, int32_t length2,
UErrorCode *status) {
UnicodeString id1Str((length1 == -1), id1, length1); // Aliasing constructor
UnicodeString id2Str((length2 == -1), id2, length2); // Aliasing constructor
if (id1Str.isBogus() || id2Str.isBogus()) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
return uspoof_areBidiConfusableUnicodeString(sc, direction, id1Str, id2Str, status);
}
U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusableUTF8(const USpoofChecker *sc, UBiDiDirection direction,
const char *id1, int32_t length1, const char *id2,
int32_t length2, UErrorCode *status) {
if (length1 < -1 || length2 < -1) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
UnicodeString id1Str = UnicodeString::fromUTF8(
StringPiece(id1, length1 >= 0 ? length1 : static_cast<int32_t>(uprv_strlen(id1))));
UnicodeString id2Str = UnicodeString::fromUTF8(
StringPiece(id2, length2 >= 0 ? length2 : static_cast<int32_t>(uprv_strlen(id2))));
return uspoof_areBidiConfusableUnicodeString(sc, direction, id1Str, id2Str, status);
}
U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusableUnicodeString(const USpoofChecker *sc,
UBiDiDirection direction,
const icu::UnicodeString &id1,
const icu::UnicodeString &id2,
UErrorCode *status) {
const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
if (U_FAILURE(*status)) {
return 0;
}
//
// See section 4 of UTS 39 for the algorithm for checking whether two strings are confusable,
// and for definitions of the types (single, whole, mixed-script) of confusables.
// We only care about a few of the check flags. Ignore the others.
// If no tests relevant to this function have been specified, return an error.
// TODO: is this really the right thing to do? It's probably an error on the caller's part,
// but logically we would just return 0 (no error).
if ((This->fChecks & USPOOF_CONFUSABLE) == 0) {
*status = U_INVALID_STATE_ERROR;
return 0;
}
// Compute the skeletons and check for confusability.
UnicodeString id1Skeleton;
uspoof_getBidiSkeletonUnicodeString(sc, direction, id1, id1Skeleton, status);
UnicodeString id2Skeleton;
uspoof_getBidiSkeletonUnicodeString(sc, direction, id2, id2Skeleton, status);
if (U_FAILURE(*status)) {
return 0;
}
if (id1Skeleton != id2Skeleton) {
return 0;
}
// If we get here, the strings are confusable. Now we just need to set the flags for the appropriate
// classes of confusables according to UTS 39 section 4. Start by computing the resolved script sets
// of id1 and id2.
ScriptSet id1RSS;
This->getResolvedScriptSet(id1, id1RSS, *status);
ScriptSet id2RSS;
This->getResolvedScriptSet(id2, id2RSS, *status);
// Turn on all applicable flags
uint32_t result = 0;
if (id1RSS.intersects(id2RSS)) {
result |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
} else {
result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
if (!id1RSS.isEmpty() && !id2RSS.isEmpty()) {
result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
}
}
// Turn off flags that the user doesn't want
return result & This->fChecks;
}
U_CAPI int32_t U_EXPORT2
uspoof_checkUnicodeString(const USpoofChecker *sc,
@ -697,6 +782,60 @@ uspoof_getSkeleton(const USpoofChecker *sc,
return destStr.length();
}
U_CAPI int32_t U_EXPORT2 uspoof_getBidiSkeleton(const USpoofChecker *sc, UBiDiDirection direction,
const UChar *id, int32_t length, UChar *dest,
int32_t destCapacity, UErrorCode *status) {
UnicodeString idStr((length == -1), id, length); // Aliasing constructor
if (idStr.isBogus()) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
UnicodeString destStr;
uspoof_getBidiSkeletonUnicodeString(sc, direction, idStr, destStr, status);
return destStr.extract(dest, destCapacity, *status);
}
U_I18N_API UnicodeString &U_EXPORT2 uspoof_getBidiSkeletonUnicodeString(const USpoofChecker *sc,
UBiDiDirection direction,
const UnicodeString &id,
UnicodeString &dest,
UErrorCode *status) {
dest.remove();
if (direction != UBIDI_LTR && direction != UBIDI_RTL) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return dest;
}
UBiDi *bidi = ubidi_open();
ubidi_setPara(bidi, id.getBuffer(), id.length(), direction,
/*embeddingLevels*/ nullptr, status);
if (U_FAILURE(*status)) {
ubidi_close(bidi);
return dest;
}
UnicodeString reordered;
int32_t const size = ubidi_getProcessedLength(bidi);
UChar* const reorderedBuffer = reordered.getBuffer(size);
if (reorderedBuffer == nullptr) {
*status = U_MEMORY_ALLOCATION_ERROR;
ubidi_close(bidi);
return dest;
}
ubidi_writeReordered(bidi, reorderedBuffer, size,
UBIDI_KEEP_BASE_COMBINING | UBIDI_DO_MIRRORING, status);
reordered.releaseBuffer(size);
ubidi_close(bidi);
if (U_FAILURE(*status)) {
return dest;
}
// The type parameter is deprecated since ICU 58; any number may be passed.
constexpr uint32_t deprecatedType = 58;
return uspoof_getSkeletonUnicodeString(sc, deprecatedType, reordered, dest, status);
}
U_I18N_API UnicodeString & U_EXPORT2
@ -721,19 +860,17 @@ uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
for (inputIndex=0; inputIndex < normalizedLen; ) {
UChar32 c = nfdId.char32At(inputIndex);
inputIndex += U16_LENGTH(c);
This->fSpoofData->confusableLookup(c, skelStr);
if (!u_hasBinaryProperty(c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) {
This->fSpoofData->confusableLookup(c, skelStr);
}
}
gNfdNormalizer->normalize(skelStr, dest, *status);
return dest;
}
U_CAPI int32_t U_EXPORT2
uspoof_getSkeletonUTF8(const USpoofChecker *sc,
uint32_t type,
const char *id, int32_t length,
char *dest, int32_t destCapacity,
U_CAPI int32_t U_EXPORT2 uspoof_getSkeletonUTF8(const USpoofChecker *sc, uint32_t type, const char *id,
int32_t length, char *dest, int32_t destCapacity,
UErrorCode *status) {
SpoofImpl::validateThis(sc, *status);
if (U_FAILURE(*status)) {
@ -744,7 +881,8 @@ uspoof_getSkeletonUTF8(const USpoofChecker *sc,
return 0;
}
UnicodeString srcStr = UnicodeString::fromUTF8(StringPiece(id, length>=0 ? length : static_cast<int32_t>(uprv_strlen(id))));
UnicodeString srcStr = UnicodeString::fromUTF8(
StringPiece(id, length >= 0 ? length : static_cast<int32_t>(uprv_strlen(id))));
UnicodeString destStr;
uspoof_getSkeletonUnicodeString(sc, type, srcStr, destStr, status);
if (U_FAILURE(*status)) {
@ -752,8 +890,28 @@ uspoof_getSkeletonUTF8(const USpoofChecker *sc,
}
int32_t lengthInUTF8 = 0;
u_strToUTF8(dest, destCapacity, &lengthInUTF8,
destStr.getBuffer(), destStr.length(), status);
u_strToUTF8(dest, destCapacity, &lengthInUTF8, destStr.getBuffer(), destStr.length(), status);
return lengthInUTF8;
}
U_CAPI int32_t U_EXPORT2 uspoof_getBidiSkeletonUTF8(const USpoofChecker *sc, UBiDiDirection direction,
const char *id, int32_t length, char *dest,
int32_t destCapacity, UErrorCode *status) {
if (length < -1) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
UnicodeString srcStr = UnicodeString::fromUTF8(
StringPiece(id, length >= 0 ? length : static_cast<int32_t>(uprv_strlen(id))));
UnicodeString destStr;
uspoof_getBidiSkeletonUnicodeString(sc, direction, srcStr, destStr, status);
if (U_FAILURE(*status)) {
return 0;
}
int32_t lengthInUTF8 = 0;
u_strToUTF8(dest, destCapacity, &lengthInUTF8, destStr.getBuffer(), destStr.length(), status);
return lengthInUTF8;
}