959 lines
31 KiB
C++
959 lines
31 KiB
C++
// © 2016 and later: Unicode, Inc. and others.
|
||
// License & terms of use: http://www.unicode.org/copyright.html
|
||
/*
|
||
**********************************************************************
|
||
* Copyright (C) 2008-2016, International Business Machines
|
||
* Corporation and others. All Rights Reserved.
|
||
**********************************************************************
|
||
*/
|
||
|
||
#include "unicode/utypes.h"
|
||
#include "unicode/uspoof.h"
|
||
#include "unicode/uchar.h"
|
||
#include "unicode/uniset.h"
|
||
#include "unicode/utf16.h"
|
||
#include "utrie2.h"
|
||
#include "cmemory.h"
|
||
#include "cstring.h"
|
||
#include "scriptset.h"
|
||
#include "umutex.h"
|
||
#include "udataswp.h"
|
||
#include "uassert.h"
|
||
#include "ucln_in.h"
|
||
#include "uspoof_impl.h"
|
||
|
||
#if !UCONFIG_NO_NORMALIZATION
|
||
|
||
|
||
U_NAMESPACE_BEGIN
|
||
|
||
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)
|
||
|
||
// Build a checker around caller-supplied spoof data.
// Common field initialization runs first; the data pointer is then stored
// exactly as passed in (this constructor does not call addReference()).
SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode& status) {
    this->construct(status);
    this->fSpoofData = data;
}
|
||
|
||
// Build a checker that uses the default confusables data.
// Any failure from construct() or from loading the data is reported via status.
SpoofImpl::SpoofImpl(UErrorCode& status) {
    this->construct(status);

    // TODO: Call this method where it is actually needed, instead of in the
    // constructor, to allow for lazy data loading.  See #12696.
    this->fSpoofData = SpoofData::getDefault(status);
}
|
||
|
||
// Default constructor. Errors cannot be reported to the caller here: they are
// collected in a local error code that is discarded when the constructor returns.
SpoofImpl::SpoofImpl() {
    UErrorCode localStatus = U_ZERO_ERROR;
    this->construct(localStatus);

    // TODO: Call this method where it is actually needed, instead of in the
    // constructor, to allow for lazy data loading.  See #12696.
    this->fSpoofData = SpoofData::getDefault(localStatus);
}
|
||
|
||
// Shared initialization used by the constructors.
// Every field is put into a defined state first, so the object is safe to
// destroy even when status already holds a failure on entry. On success the
// allowed-characters set is the frozen full code point range and the allowed
// locales string is empty.
void SpoofImpl::construct(UErrorCode& status) {
    fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
    fChecks = USPOOF_ALL_CHECKS;
    fAllowedLocales = nullptr;
    fAllowedCharsSet = nullptr;
    fSpoofData = nullptr;

    if (U_SUCCESS(status)) {
        UnicodeSet *everyCodePoint = new UnicodeSet(0, 0x10ffff);
        fAllowedCharsSet = everyCodePoint;
        fAllowedLocales = uprv_strdup("");
        if (everyCodePoint != nullptr && fAllowedLocales != nullptr) {
            everyCodePoint->freeze();
        } else {
            // Whatever was allocated stays in the fields and is released by the destructor.
            status = U_MEMORY_ALLOCATION_ERROR;
        }
    }
}
|
||
|
||
|
||
// Copy Constructor, used by the user level clone() function.
|
||
SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) :
|
||
fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
|
||
fAllowedLocales(NULL) {
|
||
if (U_FAILURE(status)) {
|
||
return;
|
||
}
|
||
fChecks = src.fChecks;
|
||
if (src.fSpoofData != NULL) {
|
||
fSpoofData = src.fSpoofData->addReference();
|
||
}
|
||
fAllowedCharsSet = src.fAllowedCharsSet->clone();
|
||
fAllowedLocales = uprv_strdup(src.fAllowedLocales);
|
||
if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
|
||
status = U_MEMORY_ALLOCATION_ERROR;
|
||
}
|
||
fRestrictionLevel = src.fRestrictionLevel;
|
||
}
|
||
|
||
// Releases this checker's reference on the spoof data and frees the owned
// allowed-characters set and allowed-locales string.
SpoofImpl::~SpoofImpl() {
    SpoofData *sharedData = fSpoofData;
    if (sharedData != nullptr) {
        // Will delete if refCount goes to zero.
        sharedData->removeReference();
    }
    delete fAllowedCharsSet;
    uprv_free(const_cast<char *>(fAllowedLocales));
}
|
||
|
||
// Cast this instance as a USpoofChecker for the C API.
|
||
USpoofChecker *SpoofImpl::asUSpoofChecker() {
|
||
return exportForC();
|
||
}
|
||
|
||
//
|
||
// Incoming parameter check on Status and the SpoofChecker object
|
||
// received from the C API.
|
||
//
|
||
const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) {
|
||
auto* This = validate(sc, status);
|
||
if (U_FAILURE(status)) {
|
||
return NULL;
|
||
}
|
||
if (This->fSpoofData != NULL && !This->fSpoofData->validateDataVersion(status)) {
|
||
return NULL;
|
||
}
|
||
return This;
|
||
}
|
||
|
||
// Non-const overload: delegates to the const overload and strips the
// constness from the result.
SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) {
    const USpoofChecker *constHandle = sc;
    const SpoofImpl *validated = SpoofImpl::validateThis(constHandle, status);
    return const_cast<SpoofImpl *>(validated);
}
|
||
|
||
|
||
void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) {
|
||
UnicodeSet allowedChars;
|
||
UnicodeSet *tmpSet = NULL;
|
||
const char *locStart = localesList;
|
||
const char *locEnd = NULL;
|
||
const char *localesListEnd = localesList + uprv_strlen(localesList);
|
||
int32_t localeListCount = 0; // Number of locales provided by caller.
|
||
|
||
// Loop runs once per locale from the localesList, a comma separated list of locales.
|
||
do {
|
||
locEnd = uprv_strchr(locStart, ',');
|
||
if (locEnd == NULL) {
|
||
locEnd = localesListEnd;
|
||
}
|
||
while (*locStart == ' ') {
|
||
locStart++;
|
||
}
|
||
const char *trimmedEnd = locEnd-1;
|
||
while (trimmedEnd > locStart && *trimmedEnd == ' ') {
|
||
trimmedEnd--;
|
||
}
|
||
if (trimmedEnd <= locStart) {
|
||
break;
|
||
}
|
||
const char *locale = uprv_strndup(locStart, (int32_t)(trimmedEnd + 1 - locStart));
|
||
localeListCount++;
|
||
|
||
// We have one locale from the locales list.
|
||
// Add the script chars for this locale to the accumulating set of allowed chars.
|
||
// If the locale is no good, we will be notified back via status.
|
||
addScriptChars(locale, &allowedChars, status);
|
||
uprv_free((void *)locale);
|
||
if (U_FAILURE(status)) {
|
||
break;
|
||
}
|
||
locStart = locEnd + 1;
|
||
} while (locStart < localesListEnd);
|
||
|
||
// If our caller provided an empty list of locales, we disable the allowed characters checking
|
||
if (localeListCount == 0) {
|
||
uprv_free((void *)fAllowedLocales);
|
||
fAllowedLocales = uprv_strdup("");
|
||
tmpSet = new UnicodeSet(0, 0x10ffff);
|
||
if (fAllowedLocales == NULL || tmpSet == NULL) {
|
||
status = U_MEMORY_ALLOCATION_ERROR;
|
||
return;
|
||
}
|
||
tmpSet->freeze();
|
||
delete fAllowedCharsSet;
|
||
fAllowedCharsSet = tmpSet;
|
||
fChecks &= ~USPOOF_CHAR_LIMIT;
|
||
return;
|
||
}
|
||
|
||
|
||
// Add all common and inherited characters to the set of allowed chars.
|
||
UnicodeSet tempSet;
|
||
tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
|
||
allowedChars.addAll(tempSet);
|
||
tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
|
||
allowedChars.addAll(tempSet);
|
||
|
||
// If anything went wrong, we bail out without changing
|
||
// the state of the spoof checker.
|
||
if (U_FAILURE(status)) {
|
||
return;
|
||
}
|
||
|
||
// Store the updated spoof checker state.
|
||
tmpSet = allowedChars.clone();
|
||
const char *tmpLocalesList = uprv_strdup(localesList);
|
||
if (tmpSet == NULL || tmpLocalesList == NULL) {
|
||
status = U_MEMORY_ALLOCATION_ERROR;
|
||
return;
|
||
}
|
||
uprv_free((void *)fAllowedLocales);
|
||
fAllowedLocales = tmpLocalesList;
|
||
tmpSet->freeze();
|
||
delete fAllowedCharsSet;
|
||
fAllowedCharsSet = tmpSet;
|
||
fChecks |= USPOOF_CHAR_LIMIT;
|
||
}
|
||
|
||
|
||
// Returns the stored allowed-locales string. The pointer refers to storage
// owned by this object; the status parameter is not used.
const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) {
    const char *locales = fAllowedLocales;
    return locales;
}
|
||
|
||
|
||
// Given a locale (a language), add all the characters from all of the scripts used with that language
|
||
// to the allowedChars UnicodeSet
|
||
|
||
void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) {
|
||
UScriptCode scripts[30];
|
||
|
||
int32_t numScripts = uscript_getCode(locale, scripts, UPRV_LENGTHOF(scripts), &status);
|
||
if (U_FAILURE(status)) {
|
||
return;
|
||
}
|
||
if (status == U_USING_DEFAULT_WARNING) {
|
||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||
return;
|
||
}
|
||
UnicodeSet tmpSet;
|
||
int32_t i;
|
||
for (i=0; i<numScripts; i++) {
|
||
tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status);
|
||
allowedChars->addAll(tmpSet);
|
||
}
|
||
}
|
||
|
||
// Computes the augmented script set for a code point, according to UTS 39 section 5.1.
|
||
void SpoofImpl::getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status) {
|
||
result.resetAll();
|
||
result.setScriptExtensions(codePoint, status);
|
||
if (U_FAILURE(status)) { return; }
|
||
|
||
// Section 5.1 step 1
|
||
if (result.test(USCRIPT_HAN, status)) {
|
||
result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
|
||
result.set(USCRIPT_JAPANESE, status);
|
||
result.set(USCRIPT_KOREAN, status);
|
||
}
|
||
if (result.test(USCRIPT_HIRAGANA, status)) {
|
||
result.set(USCRIPT_JAPANESE, status);
|
||
}
|
||
if (result.test(USCRIPT_KATAKANA, status)) {
|
||
result.set(USCRIPT_JAPANESE, status);
|
||
}
|
||
if (result.test(USCRIPT_HANGUL, status)) {
|
||
result.set(USCRIPT_KOREAN, status);
|
||
}
|
||
if (result.test(USCRIPT_BOPOMOFO, status)) {
|
||
result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
|
||
}
|
||
|
||
// Section 5.1 step 2
|
||
if (result.test(USCRIPT_COMMON, status) || result.test(USCRIPT_INHERITED, status)) {
|
||
result.setAll();
|
||
}
|
||
}
|
||
|
||
// Computes the resolved script set for a string, according to UTS 39 section 5.1.
|
||
void SpoofImpl::getResolvedScriptSet(const UnicodeString& input, ScriptSet& result, UErrorCode& status) const {
|
||
getResolvedScriptSetWithout(input, USCRIPT_CODE_LIMIT, result, status);
|
||
}
|
||
|
||
// Computes the resolved script set for a string, omitting characters having the specified script.
|
||
// If USCRIPT_CODE_LIMIT is passed as the second argument, all characters are included.
|
||
void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const {
|
||
result.setAll();
|
||
|
||
ScriptSet temp;
|
||
UChar32 codePoint;
|
||
for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
|
||
codePoint = input.char32At(i);
|
||
|
||
// Compute the augmented script set for the character
|
||
getAugmentedScriptSet(codePoint, temp, status);
|
||
if (U_FAILURE(status)) { return; }
|
||
|
||
// Intersect the augmented script set with the resolved script set, but only if the character doesn't
|
||
// have the script specified in the function call
|
||
if (script == USCRIPT_CODE_LIMIT || !temp.test(script, status)) {
|
||
result.intersect(temp);
|
||
}
|
||
}
|
||
}
|
||
|
||
// Computes the set of numerics for a string, according to UTS 39 section 5.3.
|
||
void SpoofImpl::getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& /*status*/) const {
|
||
result.clear();
|
||
|
||
UChar32 codePoint;
|
||
for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
|
||
codePoint = input.char32At(i);
|
||
|
||
// Store a representative character for each kind of decimal digit
|
||
if (u_charType(codePoint) == U_DECIMAL_DIGIT_NUMBER) {
|
||
// Store the zero character as a representative for comparison.
|
||
// Unicode guarantees it is codePoint - value
|
||
result.add(codePoint - (UChar32)u_getNumericValue(codePoint));
|
||
}
|
||
}
|
||
}
|
||
|
||
// Computes the restriction level of a string, according to UTS 39 section 5.2.
|
||
URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const {
|
||
// Section 5.2 step 1:
|
||
if (!fAllowedCharsSet->containsAll(input)) {
|
||
return USPOOF_UNRESTRICTIVE;
|
||
}
|
||
|
||
// Section 5.2 step 2
|
||
// Java use a static UnicodeSet for this test. In C++, avoid the static variable
|
||
// and just do a simple for loop.
|
||
UBool allASCII = TRUE;
|
||
for (int32_t i=0, length=input.length(); i<length; i++) {
|
||
if (input.charAt(i) > 0x7f) {
|
||
allASCII = FALSE;
|
||
break;
|
||
}
|
||
}
|
||
if (allASCII) {
|
||
return USPOOF_ASCII;
|
||
}
|
||
|
||
// Section 5.2 steps 3:
|
||
ScriptSet resolvedScriptSet;
|
||
getResolvedScriptSet(input, resolvedScriptSet, status);
|
||
if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }
|
||
|
||
// Section 5.2 step 4:
|
||
if (!resolvedScriptSet.isEmpty()) {
|
||
return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
|
||
}
|
||
|
||
// Section 5.2 step 5:
|
||
ScriptSet resolvedNoLatn;
|
||
getResolvedScriptSetWithout(input, USCRIPT_LATIN, resolvedNoLatn, status);
|
||
if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }
|
||
|
||
// Section 5.2 step 6:
|
||
if (resolvedNoLatn.test(USCRIPT_HAN_WITH_BOPOMOFO, status)
|
||
|| resolvedNoLatn.test(USCRIPT_JAPANESE, status)
|
||
|| resolvedNoLatn.test(USCRIPT_KOREAN, status)) {
|
||
return USPOOF_HIGHLY_RESTRICTIVE;
|
||
}
|
||
|
||
// Section 5.2 step 7:
|
||
if (!resolvedNoLatn.isEmpty()
|
||
&& !resolvedNoLatn.test(USCRIPT_CYRILLIC, status)
|
||
&& !resolvedNoLatn.test(USCRIPT_GREEK, status)
|
||
&& !resolvedNoLatn.test(USCRIPT_CHEROKEE, status)) {
|
||
return USPOOF_MODERATELY_RESTRICTIVE;
|
||
}
|
||
|
||
// Section 5.2 step 8:
|
||
return USPOOF_MINIMALLY_RESTRICTIVE;
|
||
}
|
||
|
||
// Scans input for U+0307 (combining dot above) following a character on which
// that dot is considered illegal (see isIllegalCombiningDotLeadCharacter).
// Returns the UTF-16 index of the first such U+0307, or -1 if there is none.
int32_t SpoofImpl::findHiddenOverlay(const UnicodeString& input, UErrorCode&) const {
    bool afterLeadCharacter = false;
    int32_t pos = 0;
    while (pos < input.length()) {
        const UChar32 current = input.char32At(pos);
        if (afterLeadCharacter && current == 0x0307) {
            return pos;
        }
        const uint8_t ccc = u_getCombiningClass(current);
        // Only characters with combining class 0 (non-combining characters) or
        // combining class 230 (same class as U+0307) update the lead state;
        // all other characters are skipped over.
        U_ASSERT(u_getCombiningClass(0x0307) == 230);
        const bool updatesLeadState = (ccc == 0 || ccc == 230);
        if (updatesLeadState) {
            afterLeadCharacter = isIllegalCombiningDotLeadCharacter(current);
        }
        pos += U16_LENGTH(current);
    }
    return -1;
}
|
||
|
||
// True if cp is one of the explicitly listed letters (i, j, dotless i,
// dotless j, l) or has the Soft_Dotted binary property. No confusables lookup
// is performed.
static inline bool isIllegalCombiningDotLeadCharacterNoLookup(UChar32 cp) {
    switch (cp) {
    case u'i':
    case u'j':
    case u'ı':
    case u'ȷ':
    case u'l':
        return true;
    default:
        return u_hasBinaryProperty(cp, UCHAR_SOFT_DOTTED);
    }
}
|
||
|
||
// True if cp itself fails the direct test, or if the last code point of its
// confusables mapping is a different character that fails the direct test.
bool SpoofImpl::isIllegalCombiningDotLeadCharacter(UChar32 cp) const {
    if (isIllegalCombiningDotLeadCharacterNoLookup(cp)) {
        return true;
    }

    UnicodeString skeleton;
    fSpoofData->confusableLookup(cp, skeleton);

    // Step back one code point from the end to find where the last one starts.
    const int32_t lastStart = skeleton.moveIndex32(skeleton.length(), -1);
    const UChar32 lastCodePoint = skeleton.char32At(lastStart);

    return lastCodePoint != cp && isIllegalCombiningDotLeadCharacterNoLookup(lastCodePoint);
}
|
||
|
||
|
||
|
||
// Convert a text format hex number. Utility function used by builder code. Static.
|
||
// Input: UChar *string text. Output: a UChar32
|
||
// Input has been pre-checked, and will have no non-hex chars.
|
||
// The number must fall in the code point range of 0..0x10ffff
|
||
// Static Function.
|
||
UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status) {
|
||
if (U_FAILURE(status)) {
|
||
return 0;
|
||
}
|
||
U_ASSERT(limit-start > 0);
|
||
uint32_t val = 0;
|
||
int i;
|
||
for (i=start; i<limit; i++) {
|
||
int digitVal = s[i] - 0x30;
|
||
if (digitVal>9) {
|
||
digitVal = 0xa + (s[i] - 0x41); // Upper Case 'A'
|
||
}
|
||
if (digitVal>15) {
|
||
digitVal = 0xa + (s[i] - 0x61); // Lower Case 'a'
|
||
}
|
||
U_ASSERT(digitVal <= 0xf);
|
||
val <<= 4;
|
||
val += digitVal;
|
||
}
|
||
if (val > 0x10ffff) {
|
||
status = U_PARSE_ERROR;
|
||
val = 0;
|
||
}
|
||
return (UChar32)val;
|
||
}
|
||
|
||
|
||
//-----------------------------------------
|
||
//
|
||
// class CheckResult Implementation
|
||
//
|
||
//-----------------------------------------
|
||
|
||
// A new CheckResult starts in the same state that clear() produces.
CheckResult::CheckResult() {
    this->clear();
}
|
||
|
||
// Returns this object viewed as the opaque USpoofCheckResult handle used by the C API.
USpoofCheckResult* CheckResult::asUSpoofCheckResult() {
    USpoofCheckResult *handle = exportForC();
    return handle;
}
|
||
|
||
//
|
||
// Incoming parameter check on Status and the CheckResult object
|
||
// received from the C API.
|
||
//
|
||
const CheckResult* CheckResult::validateThis(const USpoofCheckResult *ptr, UErrorCode &status) {
|
||
return validate(ptr, status);
|
||
}
|
||
|
||
// Non-const counterpart of the overload above; forwards to validate().
CheckResult* CheckResult::validateThis(USpoofCheckResult *ptr, UErrorCode &status) {
    CheckResult *checked = validate(ptr, status);
    return checked;
}
|
||
|
||
// Resets the result: no failed checks, no numerics, undefined restriction level.
void CheckResult::clear() {
    fRestrictionLevel = USPOOF_UNDEFINED_RESTRICTIVE;
    fNumerics.clear();
    fChecks = 0;
}
|
||
|
||
// Returns fChecks, with the restriction level OR-ed in when the caller enabled
// USPOOF_AUX_INFO and a restriction level has been recorded.
int32_t CheckResult::toCombinedBitmask(int32_t enabledChecks) {
    const bool auxInfoRequested = (enabledChecks & USPOOF_AUX_INFO) != 0;
    const bool levelKnown = (fRestrictionLevel != USPOOF_UNDEFINED_RESTRICTIVE);

    int32_t combined = fChecks;
    if (auxInfoRequested && levelKnown) {
        combined |= fRestrictionLevel;
    }
    return combined;
}
|
||
|
||
// Nothing to release explicitly; members clean up after themselves.
CheckResult::~CheckResult() = default;
|
||
|
||
//----------------------------------------------------------------------------------------------
|
||
//
|
||
// class SpoofData Implementation
|
||
//
|
||
//----------------------------------------------------------------------------------------------
|
||
|
||
|
||
// Checks that raw data is present and carries the expected magic number and
// format version (USPOOF_CONFUSABLE_DATA_FORMAT_VERSION.0.0.0).
// Returns TRUE when everything matches. Otherwise returns FALSE and sets
// status to U_INVALID_FORMAT_ERROR; this also happens when status was already
// a failure on entry.
UBool SpoofData::validateDataVersion(UErrorCode &status) const {
    bool valid = false;
    if (U_SUCCESS(status) && fRawData != nullptr && fRawData->fMagic == USPOOF_MAGIC) {
        const auto &version = fRawData->fFormatVersion;
        valid = version[0] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION &&
                version[1] == 0 &&
                version[2] == 0 &&
                version[3] == 0;
    }
    if (valid) {
        return TRUE;
    }
    status = U_INVALID_FORMAT_ERROR;
    return FALSE;
}
|
||
|
||
// Acceptance callback handed to udata_openChoice().
// Accepts data whose UDataInfo matches this platform's endianness and charset
// family, has dataFormat "Cfu " and the expected major format version.
// When context is non-null it is treated as a UVersionInfo and receives the
// data version of the accepted item.
static UBool U_CALLCONV
spoofDataIsAcceptable(void *context,
                      const char * /* type */, const char * /*name*/,
                      const UDataInfo *pInfo) {
    const bool platformMatches =
        pInfo->size >= 20 &&
        pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
        pInfo->charsetFamily == U_CHARSET_FAMILY;
    const bool formatMatches =
        platformMatches &&
        pInfo->dataFormat[0] == 0x43 &&  // dataFormat="Cfu "
        pInfo->dataFormat[1] == 0x66 &&
        pInfo->dataFormat[2] == 0x75 &&
        pInfo->dataFormat[3] == 0x20 &&
        pInfo->formatVersion[0] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION;
    if (!formatMatches) {
        return FALSE;
    }

    UVersionInfo *version = static_cast<UVersionInfo *>(context);
    if (version != NULL) {
        uprv_memcpy(version, pInfo->dataVersion, 4);
    }
    return TRUE;
}
|
||
|
||
// Methods for the loading of the default confusables data file. The confusable
|
||
// data is loaded only when it is needed.
|
||
//
|
||
// SpoofData::getDefault() - Return the default confusables data, and call the
|
||
// initOnce() if it is not available. Adds a reference
|
||
// to the SpoofData that the caller is responsible for
|
||
// decrementing when they are done with the data.
|
||
//
|
||
// uspoof_loadDefaultData - Called once, from initOnce(). The resulting SpoofData
|
||
// is shared by all spoof checkers using the default data.
|
||
//
|
||
// uspoof_cleanupDefaultData - Called during cleanup.
|
||
//
|
||
|
||
static UInitOnce gSpoofInitDefaultOnce = U_INITONCE_INITIALIZER;
|
||
static SpoofData* gDefaultSpoofData;
|
||
|
||
// Cleanup hook registered by uspoof_loadDefaultData(): drops the library's
// reference on the default spoof data and re-arms the init-once guard.
// Does nothing when no default data is loaded. Always returns TRUE.
static UBool U_CALLCONV
uspoof_cleanupDefaultData(void) {
    if (gDefaultSpoofData == nullptr) {
        return TRUE;
    }
    // Will delete, assuming all user-level spoof checkers were closed.
    gDefaultSpoofData->removeReference();
    gDefaultSpoofData = nullptr;
    gSpoofInitDefaultOnce.reset();
    return TRUE;
}
|
||
|
||
// One-time loader for the default confusables data ("confusables.cfu").
// On success gDefaultSpoofData owns the opened data and the cleanup hook is
// registered. On any failure gDefaultSpoofData is left null, status holds the
// error, and the opened data is released.
static void U_CALLCONV uspoof_loadDefaultData(UErrorCode& status) {
    UDataMemory *udm = udata_openChoice(nullptr, "cfu", "confusables",
                                        spoofDataIsAcceptable,
                                        nullptr,       // context, would receive dataVersion if supplied.
                                        &status);
    if (U_FAILURE(status)) { return; }
    gDefaultSpoofData = new SpoofData(udm, status);
    if (gDefaultSpoofData == nullptr) {
        // No SpoofData object exists to take ownership of udm, so close it
        // here; otherwise the opened data would be leaked.
        udata_close(udm);
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    if (U_FAILURE(status)) {
        // The SpoofData destructor closes udm.
        delete gDefaultSpoofData;
        gDefaultSpoofData = nullptr;
        return;
    }
    ucln_i18n_registerCleanup(UCLN_I18N_SPOOFDATA, uspoof_cleanupDefaultData);
}
|
||
|
||
// Returns the shared default spoof data, loading it on first use.
// A reference is added for the caller, who must release it with
// removeReference(). Returns NULL when status is a failure.
SpoofData* SpoofData::getDefault(UErrorCode& status) {
    umtx_initOnce(gSpoofInitDefaultOnce, &uspoof_loadDefaultData, status);
    return U_FAILURE(status) ? nullptr : gDefaultSpoofData->addReference();
}
|
||
|
||
|
||
|
||
// Wraps spoof data that was opened through the ICU data loading machinery.
// The UDataMemory is stored in fUDM (the destructor closes it) unless status
// is already a failure on entry, in which case udm is not touched.
SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
{
    reset();
    if (U_SUCCESS(status)) {
        fUDM = udm;
        // fRawData is non-const because it may be constructed by the data builder.
        const void *mapped = udata_getMemory(udm);
        fRawData = reinterpret_cast<SpoofDataHeader *>(const_cast<void *>(mapped));
        validateDataVersion(status);
        initPtrs(status);
    }
}
|
||
|
||
|
||
// Wraps caller-provided serialized spoof data. The data is referenced, not
// copied, and is not owned by this object.
//
//   data    Start of the serialized data (a SpoofDataHeader).
//   length  Number of bytes available at data.
//   status  U_INVALID_FORMAT_ERROR if length is negative, too small for the
//           header, or smaller than the length recorded in the header, or if
//           the header fails validation; U_ILLEGAL_ARGUMENT_ERROR if data is null.
SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status)
{
    reset();
    if (U_FAILURE(status)) {
        return;
    }
    // Reject negative lengths explicitly: cast to size_t they become huge and
    // would pass the minimum-size test, letting the header be read from a
    // buffer of unknown size.
    if (length < 0 || (size_t)length < sizeof(SpoofDataHeader)) {
        status = U_INVALID_FORMAT_ERROR;
        return;
    }
    if (data == NULL) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    void *ncData = const_cast<void *>(data);
    fRawData = static_cast<SpoofDataHeader *>(ncData);
    if (length < fRawData->fLength) {
        status = U_INVALID_FORMAT_ERROR;
        return;
    }
    validateDataVersion(status);
    initPtrs(status);
}
|
||
|
||
|
||
// Spoof Data constructor for use from data builder.
|
||
// Initializes a new, empty data area that will be populated later.
|
||
SpoofData::SpoofData(UErrorCode &status) {
|
||
reset();
|
||
if (U_FAILURE(status)) {
|
||
return;
|
||
}
|
||
fDataOwned = true;
|
||
|
||
// The spoof header should already be sized to be a multiple of 16 bytes.
|
||
// Just in case it's not, round it up.
|
||
uint32_t initialSize = (sizeof(SpoofDataHeader) + 15) & ~15;
|
||
U_ASSERT(initialSize == sizeof(SpoofDataHeader));
|
||
|
||
fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(initialSize));
|
||
fMemLimit = initialSize;
|
||
if (fRawData == NULL) {
|
||
status = U_MEMORY_ALLOCATION_ERROR;
|
||
return;
|
||
}
|
||
uprv_memset(fRawData, 0, initialSize);
|
||
|
||
fRawData->fMagic = USPOOF_MAGIC;
|
||
fRawData->fFormatVersion[0] = USPOOF_CONFUSABLE_DATA_FORMAT_VERSION;
|
||
fRawData->fFormatVersion[1] = 0;
|
||
fRawData->fFormatVersion[2] = 0;
|
||
fRawData->fFormatVersion[3] = 0;
|
||
initPtrs(status);
|
||
}
|
||
|
||
// reset() - initialize all fields.
|
||
// Should be updated if any new fields are added.
|
||
// Called by constructors to put things in a known initial state.
|
||
void SpoofData::reset() {
|
||
fRawData = NULL;
|
||
fDataOwned = FALSE;
|
||
fUDM = NULL;
|
||
fMemLimit = 0;
|
||
fRefCount = 1;
|
||
fCFUKeys = NULL;
|
||
fCFUValues = NULL;
|
||
fCFUStrings = NULL;
|
||
}
|
||
|
||
|
||
// SpoofData::initPtrs()
|
||
// Initialize the pointers to the various sections of the raw data.
|
||
//
|
||
// This function is used both during the Trie building process (multiple
|
||
// times, as the individual data sections are added), and
|
||
// during the opening of a Spoof Checker from prebuilt data.
|
||
//
|
||
// The pointers for non-existent data sections (identified by an offset of 0)
|
||
// are set to NULL.
|
||
//
|
||
// Note: During building the data, adding each new data section
|
||
// reallocs the raw data area, which likely relocates it, which
|
||
// in turn requires reinitializing all of the pointers into it, hence
|
||
// multiple calls to this function during building.
|
||
//
|
||
void SpoofData::initPtrs(UErrorCode &status) {
|
||
fCFUKeys = NULL;
|
||
fCFUValues = NULL;
|
||
fCFUStrings = NULL;
|
||
if (U_FAILURE(status)) {
|
||
return;
|
||
}
|
||
if (fRawData->fCFUKeys != 0) {
|
||
fCFUKeys = (int32_t *)((char *)fRawData + fRawData->fCFUKeys);
|
||
}
|
||
if (fRawData->fCFUStringIndex != 0) {
|
||
fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex);
|
||
}
|
||
if (fRawData->fCFUStringTable != 0) {
|
||
fCFUStrings = (UChar *)((char *)fRawData + fRawData->fCFUStringTable);
|
||
}
|
||
}
|
||
|
||
|
||
// Frees the raw data when this object owns it, and closes the UDataMemory
// when one is attached.
SpoofData::~SpoofData() {
    if (fDataOwned) {
        uprv_free(fRawData);
    }
    fRawData = nullptr;

    UDataMemory *openData = fUDM;
    fUDM = nullptr;
    if (openData != nullptr) {
        udata_close(openData);
    }
}
|
||
|
||
|
||
void SpoofData::removeReference() {
|
||
if (umtx_atomic_dec(&fRefCount) == 0) {
|
||
delete this;
|
||
}
|
||
}
|
||
|
||
|
||
// Adds one reference and returns this object so that calls can be chained.
SpoofData *SpoofData::addReference() {
    (void)umtx_atomic_inc(&fRefCount);
    return this;
}
|
||
|
||
|
||
// Grow the owned raw data area by numBytes (rounded up to a multiple of 16)
// and return a pointer to the new, zero-filled space.
//
// Only valid on data created by the builder constructor (fDataOwned).
// Growing may relocate the data, so the section pointers are recomputed and
// any pointer previously obtained into the raw data must be considered stale.
//
// Returns NULL if status is already a failure, or if the reallocation fails;
// in the latter case status is set to U_MEMORY_ALLOCATION_ERROR and the
// existing data, size and pointers are left exactly as they were.
void *SpoofData::reserveSpace(int32_t numBytes,  UErrorCode &status) {
    if (U_FAILURE(status)) {
        return NULL;
    }
    if (!fDataOwned) {
        UPRV_UNREACHABLE_EXIT;
    }

    numBytes = (numBytes + 15) & ~15;   // Round up to a multiple of 16
    const uint32_t returnOffset = fMemLimit;
    const uint32_t newLimit = fMemLimit + numBytes;
    SpoofDataHeader *grown = static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, newLimit));
    if (grown == nullptr) {
        // realloc leaves the original block untouched on failure, so keep using it.
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    fRawData = grown;
    fMemLimit = newLimit;
    fRawData->fLength = fMemLimit;
    uprv_memset((char *)fRawData + returnOffset, 0, numBytes);
    initPtrs(status);
    return (char *)fRawData + returnOffset;
}
|
||
|
||
// Copies the raw data into buf when capacity is large enough; otherwise sets
// status to U_BUFFER_OVERFLOW_ERROR and copies nothing. Either way the return
// value is the number of bytes the serialized form occupies.
int32_t SpoofData::serialize(void *buf, int32_t capacity, UErrorCode &status) const {
    const int32_t bytesNeeded = fRawData->fLength;
    if (capacity >= bytesNeeded) {
        uprv_memcpy(buf, fRawData, bytesNeeded);
    } else {
        status = U_BUFFER_OVERFLOW_ERROR;
    }
    return bytesNeeded;
}
|
||
|
||
// Size in bytes of the raw data, as recorded in its header.
int32_t SpoofData::size() const {
    const SpoofDataHeader *header = fRawData;
    return header->fLength;
}
|
||
|
||
//-------------------------------
|
||
//
|
||
// Front-end APIs for SpoofData
|
||
//
|
||
//-------------------------------
|
||
|
||
// Look up the confusables mapping for inChar and append it to dest.
//
// Returns the length of the appended mapping as stored in the key table. When
// inChar has no entry it maps to itself: inChar is appended and 1 is returned.
int32_t SpoofData::confusableLookup(UChar32 inChar, UnicodeString &dest) const {
    // Perform a binary search.
    // [lo, hi), i.e lo is inclusive, hi is exclusive.
    // The result after the loop will be in lo.
    int32_t lo = 0;
    int32_t hi = length();

    // With an empty key table there is nothing to search, and index 0 must not
    // be read: the character simply maps to itself.
    if (hi <= 0) {
        dest.append(inChar);
        return 1;
    }

    do {
        int32_t mid = (lo + hi) / 2;
        if (codePointAt(mid) > inChar) {
            hi = mid;
        } else if (codePointAt(mid) < inChar) {
            lo = mid;
        } else {
            // Found result.  Break early.
            lo = mid;
            break;
        }
    } while (hi - lo > 1);

    // Did we find an entry?  If not, the char maps to itself.
    if (codePointAt(lo) != inChar) {
        dest.append(inChar);
        return 1;
    }

    // Add the element to the string builder and return.
    return appendValueTo(lo, dest);
}
|
||
|
||
// Number of entries in the confusables key table, as recorded in the header.
int32_t SpoofData::length() const {
    const SpoofDataHeader *header = fRawData;
    return header->fCFUKeysSize;
}
|
||
|
||
// Code point stored in the key at the given index of the key table.
UChar32 SpoofData::codePointAt(int32_t index) const {
    const int32_t key = fCFUKeys[index];
    return ConfusableDataUtils::keyToCodePoint(key);
}
|
||
|
||
// Appends the mapping stored for the table entry at index to dest and returns
// the mapping's length as encoded in the key.
int32_t SpoofData::appendValueTo(int32_t index, UnicodeString& dest) const {
    const int32_t key = fCFUKeys[index];
    const int32_t mappingLength = ConfusableDataUtils::keyToLength(key);

    // The value is the mapped character itself for mappings of length 1, and
    // an index into the string table for longer mappings.
    const uint16_t value = fCFUValues[index];
    if (mappingLength != 1) {
        dest.append(fCFUStrings + value, mappingLength);
    } else {
        dest.append((UChar)value);
    }

    return mappingLength;
}
|
||
|
||
|
||
U_NAMESPACE_END
|
||
|
||
U_NAMESPACE_USE
|
||
|
||
//-----------------------------------------------------------------------------
|
||
//
|
||
// uspoof_swap - byte swap and char encoding swap of spoof data
|
||
//
|
||
//-----------------------------------------------------------------------------
|
||
// Byte-swap (and charset-swap) a block of serialized spoof data.
//
//   ds       The swapper describing input and output platform properties.
//   inData   ICU data header followed by the spoof data.
//   length   Size of inData in bytes, or -1 to preflight (compute the size only).
//   outData  Destination buffer; may be the same as inData for in-place swapping.
//   status   Receives U_ILLEGAL_ARGUMENT_ERROR for bad arguments,
//            U_UNSUPPORTED_ERROR for unrecognized data, U_INDEX_OUTOFBOUNDS_ERROR
//            when length is too small, or any error from swapping the ICU data header.
//
// Returns the total size in bytes (ICU data header + spoof data), or 0 on failure.
U_CAPI int32_t U_EXPORT2
uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
           UErrorCode *status) {

    if (status == NULL || U_FAILURE(*status)) {
        return 0;
    }
    if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
        *status=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    //
    //  Check that the data header is for spoof data.
    //    (Header contents are defined in gencfu.cpp)
    //
    const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
    if(!(  pInfo->dataFormat[0]==0x43 &&   /* dataFormat="Cfu " */
           pInfo->dataFormat[1]==0x66 &&
           pInfo->dataFormat[2]==0x75 &&
           pInfo->dataFormat[3]==0x20 &&
           pInfo->formatVersion[0]==USPOOF_CONFUSABLE_DATA_FORMAT_VERSION &&
           pInfo->formatVersion[1]==0 &&
           pInfo->formatVersion[2]==0 &&
           pInfo->formatVersion[3]==0  )) {
        udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
                             "(format version %02x %02x %02x %02x) is not recognized\n",
                         pInfo->dataFormat[0], pInfo->dataFormat[1],
                         pInfo->dataFormat[2], pInfo->dataFormat[3],
                         pInfo->formatVersion[0], pInfo->formatVersion[1],
                         pInfo->formatVersion[2], pInfo->formatVersion[3]);
        *status=U_UNSUPPORTED_ERROR;
        return 0;
    }

    //
    // Swap the data header.  (This is the generic ICU Data Header, not the uspoof Specific
    //                         header).  This swap also conveniently gets us
    //                         the size of the ICU d.h., which lets us locate the start
    //                         of the uspoof specific data.
    //
    int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
    if (U_FAILURE(*status)) {
        // Without a valid header size the spoof header cannot be located;
        // stop here so the original error is reported unchanged.
        return 0;
    }

    //
    // Get the Spoof Data Header, and check that it appears to be OK.
    //
    const uint8_t   *inBytes =(const uint8_t *)inData+headerSize;
    SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes;
    if (ds->readUInt32(spoofDH->fMagic)   != USPOOF_MAGIC ||
        ds->readUInt32(spoofDH->fLength)  <  sizeof(SpoofDataHeader))
    {
        udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n");
        *status=U_UNSUPPORTED_ERROR;
        return 0;
    }

    //
    // Preflight operation?  Just return the size
    //
    int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength);
    int32_t totalSize = headerSize + spoofDataLength;
    if (length < 0) {
        return totalSize;
    }

    //
    // Check that length passed in is consistent with length from Spoof data header.
    //
    if (length < totalSize) {
        udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n",
                            spoofDataLength);
        *status=U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }

    //
    // Swap the Data.  Do the data itself first, then the Spoof Data Header, because
    //                 we need to reference the header to locate the data, and an
    //                 inplace swap of the header leaves it unusable.
    //
    uint8_t          *outBytes = (uint8_t *)outData + headerSize;
    SpoofDataHeader  *outputDH = (SpoofDataHeader *)outBytes;

    int32_t   sectionStart;
    int32_t   sectionLength;

    //
    // If not swapping in place, zero out the output buffer before starting.
    //    Gaps may exist between the individual sections, and these must be zeroed in
    //    the output buffer.  The simplest way to do that is to just zero the whole thing.
    //
    if (inBytes != outBytes) {
        uprv_memset(outBytes, 0, spoofDataLength);
    }

    // Confusables Keys Section   (fCFUKeys)
    sectionStart  = ds->readUInt32(spoofDH->fCFUKeys);
    sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4;
    ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);

    // String Index Section
    sectionStart  = ds->readUInt32(spoofDH->fCFUStringIndex);
    sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2;
    ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);

    // String Table Section
    sectionStart  = ds->readUInt32(spoofDH->fCFUStringTable);
    sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2;
    ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);

    // And, last, swap the header itself.
    //   int32_t   fMagic             // swap this
    //   uint8_t   fFormatVersion[4]  // Do not swap this, just copy
    //   int32_t   fLength and all the rest       // Swap the rest, all is 32 bit stuff.
    //
    uint32_t magic = ds->readUInt32(spoofDH->fMagic);
    ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic);

    // When swapping in place the version bytes are already where they belong.
    if (outputDH->fFormatVersion != spoofDH->fFormatVersion) {
        uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion));
    }
    // swap starting at fLength
    ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status);

    return totalSize;
}
|
||
|
||
#endif
|
||
|
||
|