Path: blob/master/thirdparty/pcre2/src/pcre2_script_run.c
21421 views
/*************************************************1* Perl-Compatible Regular Expressions *2*************************************************/34/* PCRE is a library of functions to support regular expressions whose syntax5and semantics are as close as possible to those of the Perl 5 language.67Written by Philip Hazel8Original API code Copyright (c) 1997-2012 University of Cambridge9New API code Copyright (c) 2016-2021 University of Cambridge1011-----------------------------------------------------------------------------12Redistribution and use in source and binary forms, with or without13modification, are permitted provided that the following conditions are met:1415* Redistributions of source code must retain the above copyright notice,16this list of conditions and the following disclaimer.1718* Redistributions in binary form must reproduce the above copyright19notice, this list of conditions and the following disclaimer in the20documentation and/or other materials provided with the distribution.2122* Neither the name of the University of Cambridge nor the names of its23contributors may be used to endorse or promote products derived from24this software without specific prior written permission.2526THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE29ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE36POSSIBILITY OF SUCH DAMAGE.37-----------------------------------------------------------------------------38*/394041/* This module contains the function for checking a script run. */424344#include "pcre2_internal.h"45464748/*************************************************49* Check script run *50*************************************************/5152/* A script run is conceptually a sequence of characters all in the same53Unicode script. However, it isn't quite that simple. There are special rules54for scripts that are commonly used together, and also special rules for digits.55This function implements the appropriate checks, which is possible only when56PCRE2 is compiled with Unicode support. The function returns TRUE if there is57no Unicode support; however, it should never be called in that circumstance58because an error is given by pcre2_compile() if a script run is called for in a59version of PCRE2 compiled without Unicode support.6061Arguments:62pgr point to the first character63endptr point after the last character64utf TRUE if in UTF mode6566Returns: TRUE if this is a valid script run67*/6869/* These are states in the checking process. */7071enum { SCRIPT_UNSET, /* Requirement as yet unknown */72SCRIPT_MAP, /* Bitmap contains acceptable scripts */73SCRIPT_HANPENDING, /* Have had only Han characters */74SCRIPT_HANHIRAKATA, /* Expect Han or Hirikata */75SCRIPT_HANBOPOMOFO, /* Expect Han or Bopomofo */76SCRIPT_HANHANGUL /* Expect Han or Hangul */77};7879#define UCD_MAPSIZE (ucp_Unknown/32 + 1)80#define FULL_MAPSIZE (ucp_Script_Count/32 + 1)8182BOOL83PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)84{85#ifdef SUPPORT_UNICODE86uint32_t require_state = SCRIPT_UNSET;87uint32_t require_map[FULL_MAPSIZE];88uint32_t map[FULL_MAPSIZE];89uint32_t require_digitset = 0;90uint32_t c;9192#if PCRE2_CODE_UNIT_WIDTH == 3293(void)utf; /* Avoid compiler warning */94#endif9596/* Any string containing fewer than 2 characters is a valid script run. */9798if (ptr >= endptr) return TRUE;99GETCHARINCTEST(c, ptr);100if (ptr >= endptr) return TRUE;101102/* Initialize the require map. This is a full-size bitmap that has a bit for103every script, as opposed to the maps in ucd_script_sets, which only have bits104for scripts less than ucp_Unknown - those that appear in script extension105lists. */106107for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] = 0;108109/* Scan strings of two or more characters, checking the Unicode characteristics110of each code point. There is special code for scripts that can be combined with111characters from the Han Chinese script. This may be used in conjunction with112four other scripts in these combinations:113114. Han with Hiragana and Katakana is allowed (for Japanese).115. Han with Bopomofo is allowed (for Taiwanese Mandarin).116. Han with Hangul is allowed (for Korean).117118If the first significant character's script is one of the four, the required119script type is immediately known. However, if the first significant120character's script is Han, we have to keep checking for a non-Han character.121Hence the SCRIPT_HANPENDING state. */122123for (;;)124{125const ucd_record *ucd = GET_UCD(c);126uint32_t script = ucd->script;127128/* If the script is Unknown, the string is not a valid script run. Such129characters can only form script runs of length one (see test above). */130131if (script == ucp_Unknown) return FALSE;132133/* A character without any script extensions whose script is Inherited or134Common is always accepted with any script. If there are extensions, the135following processing happens for all scripts. */136137if (UCD_SCRIPTX_PROP(ucd) != 0 || (script != ucp_Inherited && script != ucp_Common))138{139BOOL OK;140141/* Set up a full-sized map for this character that can include bits for all142scripts. Copy the scriptx map for this character (which covers those143scripts that appear in script extension lists), set the remaining values to144zero, and then, except for Common or Inherited, add this script's bit to145the map. */146147memcpy(map, PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(ucd), UCD_MAPSIZE * sizeof(uint32_t));148memset(map + UCD_MAPSIZE, 0, (FULL_MAPSIZE - UCD_MAPSIZE) * sizeof(uint32_t));149if (script != ucp_Common && script != ucp_Inherited) MAPSET(map, script);150151/* Handle the different checking states */152153switch(require_state)154{155/* First significant character - it might follow Common or Inherited156characters that do not have any script extensions. */157158case SCRIPT_UNSET:159switch(script)160{161case ucp_Han:162require_state = SCRIPT_HANPENDING;163break;164165case ucp_Hiragana:166case ucp_Katakana:167require_state = SCRIPT_HANHIRAKATA;168break;169170case ucp_Bopomofo:171require_state = SCRIPT_HANBOPOMOFO;172break;173174case ucp_Hangul:175require_state = SCRIPT_HANHANGUL;176break;177178default:179memcpy(require_map, map, FULL_MAPSIZE * sizeof(uint32_t));180require_state = SCRIPT_MAP;181break;182}183break;184185/* The first significant character was Han. An inspection of the Unicode18611.0.0 files shows that there are the following types of Script Extension187list that involve the Han, Bopomofo, Hiragana, Katakana, and Hangul188scripts:189190. Bopomofo + Han191. Han + Hiragana + Katakana192. Hiragana + Katakana193. Bopopmofo + Hangul + Han + Hiragana + Katakana194195The following code tries to make sense of this. */196197#define FOUND_BOPOMOFO 1198#define FOUND_HIRAGANA 2199#define FOUND_KATAKANA 4200#define FOUND_HANGUL 8201202case SCRIPT_HANPENDING:203if (script != ucp_Han) /* Another Han does nothing */204{205uint32_t chspecial = 0;206207if (MAPBIT(map, ucp_Bopomofo) != 0) chspecial |= FOUND_BOPOMOFO;208if (MAPBIT(map, ucp_Hiragana) != 0) chspecial |= FOUND_HIRAGANA;209if (MAPBIT(map, ucp_Katakana) != 0) chspecial |= FOUND_KATAKANA;210if (MAPBIT(map, ucp_Hangul) != 0) chspecial |= FOUND_HANGUL;211212if (chspecial == 0) return FALSE; /* Not allowed with Han */213214if (chspecial == FOUND_BOPOMOFO)215require_state = SCRIPT_HANBOPOMOFO;216else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA))217require_state = SCRIPT_HANHIRAKATA;218219/* Otherwise this character must be allowed with all of them, so remain220in the pending state. */221}222break;223224/* Previously encountered one of the "with Han" scripts. Check that225this character is appropriate. */226227case SCRIPT_HANHIRAKATA:228if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hiragana) +229MAPBIT(map, ucp_Katakana) == 0) return FALSE;230break;231232case SCRIPT_HANBOPOMOFO:233if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Bopomofo) == 0) return FALSE;234break;235236case SCRIPT_HANHANGUL:237if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hangul) == 0) return FALSE;238break;239240/* Previously encountered one or more characters that are allowed with a241list of scripts. */242243case SCRIPT_MAP:244OK = FALSE;245246for (int i = 0; i < FULL_MAPSIZE; i++)247{248if ((require_map[i] & map[i]) != 0)249{250OK = TRUE;251break;252}253}254255if (!OK) return FALSE;256257/* The rest of the string must be in this script, but we have to258allow for the Han complications. */259260switch(script)261{262case ucp_Han:263require_state = SCRIPT_HANPENDING;264break;265266case ucp_Hiragana:267case ucp_Katakana:268require_state = SCRIPT_HANHIRAKATA;269break;270271case ucp_Bopomofo:272require_state = SCRIPT_HANBOPOMOFO;273break;274275case ucp_Hangul:276require_state = SCRIPT_HANHANGUL;277break;278279/* Compute the intersection of the required list of scripts and the280allowed scripts for this character. */281282default:283for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] &= map[i];284break;285}286287break;288}289} /* End checking character's script and extensions. */290291/* The character is in an acceptable script. We must now ensure that all292decimal digits in the string come from the same set. Some scripts (e.g.293Common, Arabic) have more than one set of decimal digits. This code does294not allow mixing sets, even within the same script. The vector called295PRIV(ucd_digit_sets)[] contains, in its first element, the number of296following elements, and then, in ascending order, the code points of the297'9' characters in every set of 10 digits. Each set is identified by the298offset in the vector of its '9' character. An initial check of the first299value picks up ASCII digits quickly. Otherwise, a binary chop is used. */300301if (ucd->chartype == ucp_Nd)302{303uint32_t digitset;304305if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else306{307int mid;308int bot = 1;309int top = PRIV(ucd_digit_sets)[0];310for (;;)311{312if (top <= bot + 1) /* <= rather than == is paranoia */313{314digitset = top;315break;316}317mid = (top + bot) / 2;318if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;319}320}321322/* A required value of 0 means "unset". */323324if (require_digitset == 0) require_digitset = digitset;325else if (digitset != require_digitset) return FALSE;326} /* End digit handling */327328/* If we haven't yet got to the end, pick up the next character. */329330if (ptr >= endptr) return TRUE;331GETCHARINCTEST(c, ptr);332} /* End checking loop */333334#else /* NOT SUPPORT_UNICODE */335(void)ptr;336(void)endptr;337(void)utf;338return TRUE;339#endif /* SUPPORT_UNICODE */340}341342/* End of pcre2_script_run.c */343344345