/* Path: blob/master/thirdparty/pcre2/src/pcre2_convert.c (21731 views) */
/*************************************************1* Perl-Compatible Regular Expressions *2*************************************************/34/* PCRE is a library of functions to support regular expressions whose syntax5and semantics are as close as possible to those of the Perl 5 language.67Written by Philip Hazel8Original API code Copyright (c) 1997-2012 University of Cambridge9New API code Copyright (c) 2016-2024 University of Cambridge1011-----------------------------------------------------------------------------12Redistribution and use in source and binary forms, with or without13modification, are permitted provided that the following conditions are met:1415* Redistributions of source code must retain the above copyright notice,16this list of conditions and the following disclaimer.1718* Redistributions in binary form must reproduce the above copyright19notice, this list of conditions and the following disclaimer in the20documentation and/or other materials provided with the distribution.2122* Neither the name of the University of Cambridge nor the names of its23contributors may be used to endorse or promote products derived from24this software without specific prior written permission.2526THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE29ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE36POSSIBILITY OF SUCH DAMAGE.37-----------------------------------------------------------------------------38*/394041#include "pcre2_internal.h"42434445#define TYPE_OPTIONS (PCRE2_CONVERT_GLOB| \46PCRE2_CONVERT_POSIX_BASIC|PCRE2_CONVERT_POSIX_EXTENDED)4748#define ALL_OPTIONS (PCRE2_CONVERT_UTF|PCRE2_CONVERT_NO_UTF_CHECK| \49PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR| \50PCRE2_CONVERT_GLOB_NO_STARSTAR| \51TYPE_OPTIONS)5253#define DUMMY_BUFFER_SIZE 1005455/* Generated pattern fragments */5657#define STR_BACKSLASH_A STR_BACKSLASH STR_A58#define STR_BACKSLASH_z STR_BACKSLASH STR_z59#define STR_COLON_RIGHT_SQUARE_BRACKET STR_COLON STR_RIGHT_SQUARE_BRACKET60#define STR_DOT_STAR_LOOKBEHIND STR_DOT STR_ASTERISK STR_LEFT_PARENTHESIS STR_QUESTION_MARK STR_LESS_THAN_SIGN STR_EQUALS_SIGN61#define STR_LOOKAHEAD_NOT_DOT STR_LEFT_PARENTHESIS STR_QUESTION_MARK STR_EXCLAMATION_MARK STR_BACKSLASH STR_DOT STR_RIGHT_PARENTHESIS62#define STR_QUERY_s STR_LEFT_PARENTHESIS STR_QUESTION_MARK STR_s STR_RIGHT_PARENTHESIS63#define STR_STAR_NUL STR_LEFT_PARENTHESIS STR_ASTERISK STR_N STR_U STR_L STR_RIGHT_PARENTHESIS6465/* States for POSIX processing */6667enum { POSIX_START_REGEX, POSIX_ANCHORED, POSIX_NOT_BRACKET,68POSIX_CLASS_NOT_STARTED, POSIX_CLASS_STARTING, POSIX_CLASS_STARTED };6970/* Macro to add a character string to the output buffer, checking for overflow. 
*/7172#define PUTCHARS(string) \73{ \74for (const char *s = string; *s != 0; s++) \75{ \76if (p >= endp) return PCRE2_ERROR_NOMEMORY; \77*p++ = *s; \78} \79}8081/* Macro to check for lowercase characters. */8283#ifdef EBCDIC84#define ISLOWER(c) (((c) >= CHAR_a && (c) <= CHAR_i) || \85((c) >= CHAR_j && (c) <= CHAR_r) || \86((c) >= CHAR_s && (c) <= CHAR_z))87#else88#define ISLOWER(c) ((c) >= CHAR_a && (c) <= CHAR_z)89#endif9091/* Literals that must be escaped: \ ? * + | . ^ $ { } [ ] ( ) */9293static const char *pcre2_escaped_literals =94STR_BACKSLASH STR_QUESTION_MARK STR_ASTERISK STR_PLUS95STR_VERTICAL_LINE STR_DOT STR_CIRCUMFLEX_ACCENT STR_DOLLAR_SIGN96STR_LEFT_CURLY_BRACKET STR_RIGHT_CURLY_BRACKET97STR_LEFT_SQUARE_BRACKET STR_RIGHT_SQUARE_BRACKET98STR_LEFT_PARENTHESIS STR_RIGHT_PARENTHESIS;99100/* Recognized escaped metacharacters in POSIX basic patterns. */101102static const char *posix_meta_escapes =103STR_LEFT_PARENTHESIS STR_RIGHT_PARENTHESIS104STR_LEFT_CURLY_BRACKET STR_RIGHT_CURLY_BRACKET105STR_1 STR_2 STR_3 STR_4 STR_5 STR_6 STR_7 STR_8 STR_9;106107/* Recognized POSIX classes, colon-separated. 
*/108109static const char *posix_classes =110STR_a STR_l STR_p STR_h STR_a STR_COLON111STR_l STR_o STR_w STR_e STR_r STR_COLON112STR_u STR_p STR_p STR_e STR_r STR_COLON113STR_a STR_l STR_n STR_u STR_m STR_COLON114STR_a STR_s STR_c STR_i STR_i STR_COLON115STR_b STR_l STR_a STR_n STR_k STR_COLON116STR_c STR_n STR_t STR_r STR_l STR_COLON117STR_d STR_i STR_g STR_i STR_t STR_COLON118STR_g STR_r STR_a STR_p STR_h STR_COLON119STR_p STR_r STR_i STR_n STR_t STR_COLON120STR_p STR_u STR_n STR_c STR_t STR_COLON121STR_s STR_p STR_a STR_c STR_e STR_COLON122STR_w STR_o STR_r STR_d STR_COLON123STR_x STR_d STR_i STR_g STR_i STR_t STR_COLON;124125126127/*************************************************128* Convert a POSIX pattern *129*************************************************/130131/* This function handles both basic and extended POSIX patterns.132133Arguments:134pattype the pattern type135pattern the pattern136plength length in code units137utf TRUE if UTF138use_buffer where to put the output139use_length length of use_buffer140bufflenptr where to put the used length141dummyrun TRUE if a dummy run142ccontext the convert context143144Returns: 0 => success145!0 => error code146*/147148static int149convert_posix(uint32_t pattype, PCRE2_SPTR pattern, PCRE2_SIZE plength,150BOOL utf, PCRE2_UCHAR *use_buffer, PCRE2_SIZE use_length,151PCRE2_SIZE *bufflenptr, BOOL dummyrun, pcre2_convert_context *ccontext)152{153PCRE2_SPTR posix = pattern;154PCRE2_UCHAR *p = use_buffer;155PCRE2_UCHAR *pp = p;156PCRE2_UCHAR *endp = p + use_length - 1; /* Allow for trailing zero */157PCRE2_SIZE convlength = 0;158159uint32_t bracount = 0;160uint32_t posix_state = POSIX_START_REGEX;161uint32_t lastspecial = 0;162BOOL extended = (pattype & PCRE2_CONVERT_POSIX_EXTENDED) != 0;163BOOL nextisliteral = FALSE;164165(void)utf; /* Not used when Unicode not supported */166(void)ccontext; /* Not currently used */167168/* Initialize default for error offset as end of input. 
*/169170*bufflenptr = plength;171PUTCHARS(STR_STAR_NUL);172173/* Now scan the input. */174175while (plength > 0)176{177uint32_t c, sc;178int clength = 1;179180/* Add in the length of the last item, then, if in the dummy run, pull the181pointer back to the start of the (temporary) buffer and then remember the182start of the next item. */183184convlength += p - pp;185if (dummyrun) p = use_buffer;186pp = p;187188/* Pick up the next character */189190#ifndef SUPPORT_UNICODE191c = *posix;192#else193GETCHARLENTEST(c, posix, clength);194#endif195posix += clength;196plength -= clength;197198sc = nextisliteral? 0 : c;199nextisliteral = FALSE;200201/* Handle a character within a class. */202203if (posix_state >= POSIX_CLASS_NOT_STARTED)204{205if (c == CHAR_RIGHT_SQUARE_BRACKET)206{207PUTCHARS(STR_RIGHT_SQUARE_BRACKET);208posix_state = POSIX_NOT_BRACKET;209}210211/* Not the end of the class */212213else214{215switch (posix_state)216{217case POSIX_CLASS_STARTED:218if (ISLOWER(c)) break; /* Remain in started state */219posix_state = POSIX_CLASS_NOT_STARTED;220if (c == CHAR_COLON && plength > 0 &&221*posix == CHAR_RIGHT_SQUARE_BRACKET)222{223PUTCHARS(STR_COLON_RIGHT_SQUARE_BRACKET);224plength--;225posix++;226continue; /* With next character after :] */227}228PCRE2_FALLTHROUGH /* Fall through */229230case POSIX_CLASS_NOT_STARTED:231if (c == CHAR_LEFT_SQUARE_BRACKET)232posix_state = POSIX_CLASS_STARTING;233break;234235case POSIX_CLASS_STARTING:236if (c == CHAR_COLON) posix_state = POSIX_CLASS_STARTED;237break;238}239240if (c == CHAR_BACKSLASH) PUTCHARS(STR_BACKSLASH);241if (p + clength > endp) return PCRE2_ERROR_NOMEMORY;242memcpy(p, posix - clength, CU2BYTES(clength));243p += clength;244}245}246247/* Handle a character not within a class. */248249else switch(sc)250{251case CHAR_LEFT_SQUARE_BRACKET:252PUTCHARS(STR_LEFT_SQUARE_BRACKET);253254#ifdef NEVER255/* We could handle special cases [[:<:]] and [[:>:]] (which PCRE does256support) but they are not part of POSIX 1003.1. 
*/257258if (plength >= 6)259{260if (posix[0] == CHAR_LEFT_SQUARE_BRACKET &&261posix[1] == CHAR_COLON &&262(posix[2] == CHAR_LESS_THAN_SIGN ||263posix[2] == CHAR_GREATER_THAN_SIGN) &&264posix[3] == CHAR_COLON &&265posix[4] == CHAR_RIGHT_SQUARE_BRACKET &&266posix[5] == CHAR_RIGHT_SQUARE_BRACKET)267{268if (p + 6 > endp) return PCRE2_ERROR_NOMEMORY;269memcpy(p, posix, CU2BYTES(6));270p += 6;271posix += 6;272plength -= 6;273continue; /* With next character */274}275}276#endif277278/* Handle start of "normal" character classes */279280posix_state = POSIX_CLASS_NOT_STARTED;281282/* Handle ^ and ] as first characters */283284if (plength > 0)285{286if (*posix == CHAR_CIRCUMFLEX_ACCENT)287{288posix++;289plength--;290PUTCHARS(STR_CIRCUMFLEX_ACCENT);291}292if (plength > 0 && *posix == CHAR_RIGHT_SQUARE_BRACKET)293{294posix++;295plength--;296PUTCHARS(STR_RIGHT_SQUARE_BRACKET);297}298}299break;300301case CHAR_BACKSLASH:302if (plength == 0) return PCRE2_ERROR_END_BACKSLASH;303if (extended) nextisliteral = TRUE; else304{305if (*posix < 255 && strchr(posix_meta_escapes, *posix) != NULL)306{307if (*posix >= CHAR_0 && *posix <= CHAR_9) PUTCHARS(STR_BACKSLASH);308if (p + 1 > endp) return PCRE2_ERROR_NOMEMORY;309lastspecial = *p++ = *posix++;310plength--;311}312else nextisliteral = TRUE;313}314break;315316case CHAR_RIGHT_PARENTHESIS:317if (!extended || bracount == 0) goto ESCAPE_LITERAL;318bracount--;319goto COPY_SPECIAL;320321case CHAR_LEFT_PARENTHESIS:322bracount++;323PCRE2_FALLTHROUGH /* Fall through */324325case CHAR_QUESTION_MARK:326case CHAR_PLUS:327case CHAR_LEFT_CURLY_BRACKET:328case CHAR_RIGHT_CURLY_BRACKET:329case CHAR_VERTICAL_LINE:330if (!extended) goto ESCAPE_LITERAL;331PCRE2_FALLTHROUGH /* Fall through */332333case CHAR_DOT:334case CHAR_DOLLAR_SIGN:335posix_state = POSIX_NOT_BRACKET;336COPY_SPECIAL:337lastspecial = c;338if (p + 1 > endp) return PCRE2_ERROR_NOMEMORY;339*p++ = c;340break;341342case CHAR_ASTERISK:343if (lastspecial != CHAR_ASTERISK)344{345if (!extended && 
(posix_state < POSIX_NOT_BRACKET ||346lastspecial == CHAR_LEFT_PARENTHESIS))347goto ESCAPE_LITERAL;348goto COPY_SPECIAL;349}350break; /* Ignore second and subsequent asterisks */351352case CHAR_CIRCUMFLEX_ACCENT:353if (extended) goto COPY_SPECIAL;354if (posix_state == POSIX_START_REGEX ||355lastspecial == CHAR_LEFT_PARENTHESIS)356{357posix_state = POSIX_ANCHORED;358goto COPY_SPECIAL;359}360PCRE2_FALLTHROUGH /* Fall through */361362default:363if (c < 255 && strchr(pcre2_escaped_literals, c) != NULL)364{365ESCAPE_LITERAL:366PUTCHARS(STR_BACKSLASH);367}368lastspecial = 0xff; /* Indicates nothing special */369if (p + clength > endp) return PCRE2_ERROR_NOMEMORY;370memcpy(p, posix - clength, CU2BYTES(clength));371p += clength;372posix_state = POSIX_NOT_BRACKET;373break;374}375}376377if (posix_state >= POSIX_CLASS_NOT_STARTED)378return PCRE2_ERROR_MISSING_SQUARE_BRACKET;379convlength += p - pp; /* Final segment */380*bufflenptr = convlength;381*p++ = 0;382return 0;383}384385386/*************************************************387* Convert a glob pattern *388*************************************************/389390/* Context for writing the output into a buffer. 
*/391392typedef struct pcre2_output_context {393PCRE2_UCHAR *output; /* current output position */394PCRE2_SPTR output_end; /* output end */395PCRE2_SIZE output_size; /* size of the output */396uint8_t out_str[8]; /* string copied to the output */397} pcre2_output_context;398399400/* Write a character into the output.401402Arguments:403out output context404chr the next character405*/406407static void408convert_glob_write(pcre2_output_context *out, PCRE2_UCHAR chr)409{410out->output_size++;411412if (out->output < out->output_end)413*out->output++ = chr;414}415416417/* Write a string into the output.418419Arguments:420out output context421length length of out->out_str422*/423424static void425convert_glob_write_str(pcre2_output_context *out, PCRE2_SIZE length)426{427uint8_t *out_str = out->out_str;428PCRE2_UCHAR *output = out->output;429PCRE2_SPTR output_end = out->output_end;430PCRE2_SIZE output_size = out->output_size;431432do433{434output_size++;435436if (output < output_end)437*output++ = *out_str++;438}439while (--length != 0);440441out->output = output;442out->output_size = output_size;443}444445446/* Prints the separator into the output.447448Arguments:449out output context450separator glob separator451with_escape backslash is needed before separator452*/453454static void455convert_glob_print_separator(pcre2_output_context *out,456PCRE2_UCHAR separator, BOOL with_escape)457{458if (with_escape)459convert_glob_write(out, CHAR_BACKSLASH);460461convert_glob_write(out, separator);462}463464465/* Prints a wildcard into the output.466467Arguments:468out output context469separator glob separator470with_escape backslash is needed before separator471*/472473static void474convert_glob_print_wildcard(pcre2_output_context *out,475PCRE2_UCHAR separator, BOOL with_escape)476{477out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET;478out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT;479convert_glob_write_str(out, 2);480481convert_glob_print_separator(out, separator, 
with_escape);482483convert_glob_write(out, CHAR_RIGHT_SQUARE_BRACKET);484}485486487/* Parse a posix class.488489Arguments:490from starting point of scanning the range491pattern_end end of pattern492out output context493494Returns: >0 => class index4950 => malformed class496*/497498static int499convert_glob_parse_class(PCRE2_SPTR *from, PCRE2_SPTR pattern_end,500pcre2_output_context *out)501{502PCRE2_SPTR start = *from + 1;503PCRE2_SPTR pattern = start;504const char *class_ptr;505PCRE2_UCHAR c;506int class_index;507508while (TRUE)509{510if (pattern >= pattern_end) return 0;511512c = *pattern++;513514if (c < CHAR_a || c > CHAR_z) break;515}516517if (c != CHAR_COLON || pattern >= pattern_end ||518*pattern != CHAR_RIGHT_SQUARE_BRACKET)519return 0;520521class_ptr = posix_classes;522class_index = 1;523524while (TRUE)525{526if (*class_ptr == 0) return 0;527528pattern = start;529530while (*pattern == (PCRE2_UCHAR) *class_ptr)531{532if (*pattern == CHAR_COLON)533{534pattern += 2;535start -= 2;536537do convert_glob_write(out, *start++); while (start < pattern);538539*from = pattern;540return class_index;541}542pattern++;543class_ptr++;544}545546while (*class_ptr != CHAR_COLON) class_ptr++;547class_ptr++;548class_index++;549}550}551552/* Checks whether the character is in the class.553554Arguments:555class_index class index556c character557558Returns: !0 => character is found in the class5590 => otherwise560*/561562static BOOL563convert_glob_char_in_class(int class_index, PCRE2_UCHAR c)564{565const uint8_t *cbits = PRIV(default_tables) + cbits_offset;566int cbit;567568#if PCRE2_CODE_UNIT_WIDTH != 8569if (c > 0xff)570{571/* Can't access the character tables for c > 0xff */572return FALSE;573}574#endif575576/* See posix_class_maps. This is a small local clone of that.577Note that we don't know exactly what character tables will be used at578match time, but, for the purposes of pattern conversion, it should be579sufficient to use PCRE2's built-in default tables. 
*/580581switch (class_index)582{583case 1: /* alpha */584if (c == CHAR_UNDERSCORE) return FALSE;585if (((cbits + cbit_digit)[c/8] & (1u << (c&7))) != 0) return FALSE;586cbit = cbit_word;587break;588589case 2: cbit = cbit_lower; break; /* lower */590case 3: cbit = cbit_upper; break; /* upper */591592case 4: /* alnum */593if (c == CHAR_UNDERSCORE) return FALSE;594cbit = cbit_word;595break;596597case 5: /* ascii */598if (((cbits + cbit_cntrl)[c/8] & (1u << (c&7))) != 0) return TRUE;599cbit = cbit_print;600break;601602case 6: /* blank */603if (c == CHAR_LF || c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)604return FALSE;605cbit = cbit_space;606break;607608case 7: cbit = cbit_cntrl; break; /* cntrl */609case 8: cbit = cbit_digit; break; /* digit */610case 9: cbit = cbit_graph; break; /* graph */611case 10: cbit = cbit_print; break; /* print */612case 11: cbit = cbit_punct; break; /* punct */613case 12: cbit = cbit_space; break; /* space */614case 13: cbit = cbit_word; break; /* word */615case 14: cbit = cbit_xdigit; break; /* xdigit */616default: return FALSE;617}618619return ((cbits + cbit)[c/8] & (1u << (c&7))) != 0;620}621622/* Parse a range of characters.623624Arguments:625from starting point of scanning the range626pattern_end end of pattern627out output context628separator glob separator629with_escape backslash is needed before separator630631Returns: 0 => success632!0 => error code633*/634635static int636convert_glob_parse_range(PCRE2_SPTR *from, PCRE2_SPTR pattern_end,637pcre2_output_context *out, BOOL utf, PCRE2_UCHAR separator,638BOOL with_escape, PCRE2_UCHAR escape, BOOL no_wildsep)639{640BOOL is_negative = FALSE;641BOOL separator_seen = FALSE;642BOOL has_prev_c;643PCRE2_SPTR pattern = *from;644PCRE2_SPTR char_start = NULL;645uint32_t c, prev_c;646int len, class_index;647648(void)utf; /* Avoid compiler warning. 
*/649650if (pattern >= pattern_end)651{652*from = pattern;653return PCRE2_ERROR_MISSING_SQUARE_BRACKET;654}655656if (*pattern == CHAR_EXCLAMATION_MARK657|| *pattern == CHAR_CIRCUMFLEX_ACCENT)658{659pattern++;660661if (pattern >= pattern_end)662{663*from = pattern;664return PCRE2_ERROR_MISSING_SQUARE_BRACKET;665}666667is_negative = TRUE;668669out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET;670out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT;671len = 2;672673if (!no_wildsep)674{675if (with_escape)676{677out->out_str[len] = CHAR_BACKSLASH;678len++;679}680out->out_str[len] = (uint8_t) separator;681}682683convert_glob_write_str(out, len + 1);684}685else686convert_glob_write(out, CHAR_LEFT_SQUARE_BRACKET);687688has_prev_c = FALSE;689prev_c = 0;690691if (*pattern == CHAR_RIGHT_SQUARE_BRACKET)692{693out->out_str[0] = CHAR_BACKSLASH;694out->out_str[1] = CHAR_RIGHT_SQUARE_BRACKET;695convert_glob_write_str(out, 2);696has_prev_c = TRUE;697prev_c = CHAR_RIGHT_SQUARE_BRACKET;698pattern++;699}700701while (pattern < pattern_end)702{703char_start = pattern;704GETCHARINCTEST(c, pattern);705706if (c == CHAR_RIGHT_SQUARE_BRACKET)707{708convert_glob_write(out, c);709710if (!is_negative && !no_wildsep && separator_seen)711{712out->out_str[0] = CHAR_LEFT_PARENTHESIS;713out->out_str[1] = CHAR_QUESTION_MARK;714out->out_str[2] = CHAR_LESS_THAN_SIGN;715out->out_str[3] = CHAR_EXCLAMATION_MARK;716convert_glob_write_str(out, 4);717718convert_glob_print_separator(out, separator, with_escape);719convert_glob_write(out, CHAR_RIGHT_PARENTHESIS);720}721722*from = pattern;723return 0;724}725726if (pattern >= pattern_end) break;727728if (c == CHAR_LEFT_SQUARE_BRACKET && *pattern == CHAR_COLON)729{730*from = pattern;731class_index = convert_glob_parse_class(from, pattern_end, out);732733if (class_index != 0)734{735pattern = *from;736737has_prev_c = FALSE;738prev_c = 0;739740if (!is_negative &&741convert_glob_char_in_class (class_index, separator))742separator_seen = TRUE;743continue;744}745}746else if (c == 
CHAR_MINUS && has_prev_c &&747*pattern != CHAR_RIGHT_SQUARE_BRACKET)748{749convert_glob_write(out, CHAR_MINUS);750751char_start = pattern;752GETCHARINCTEST(c, pattern);753754if (pattern >= pattern_end) break;755756if (escape != 0 && c == escape)757{758char_start = pattern;759GETCHARINCTEST(c, pattern);760}761else if (c == CHAR_LEFT_SQUARE_BRACKET && *pattern == CHAR_COLON)762{763*from = pattern;764return PCRE2_ERROR_CONVERT_SYNTAX;765}766767if (prev_c > c)768{769*from = pattern;770return PCRE2_ERROR_CONVERT_SYNTAX;771}772773if (prev_c < separator && separator < c) separator_seen = TRUE;774775has_prev_c = FALSE;776prev_c = 0;777}778else779{780if (escape != 0 && c == escape)781{782char_start = pattern;783GETCHARINCTEST(c, pattern);784785if (pattern >= pattern_end) break;786}787788has_prev_c = TRUE;789prev_c = c;790}791792if (c == CHAR_LEFT_SQUARE_BRACKET || c == CHAR_RIGHT_SQUARE_BRACKET ||793c == CHAR_BACKSLASH || c == CHAR_MINUS)794convert_glob_write(out, CHAR_BACKSLASH);795796if (c == separator) separator_seen = TRUE;797798do convert_glob_write(out, *char_start++); while (char_start < pattern);799}800801*from = pattern;802return PCRE2_ERROR_MISSING_SQUARE_BRACKET;803}804805806/* Prints a (*COMMIT) into the output.807808Arguments:809out output context810*/811812static void813convert_glob_print_commit(pcre2_output_context *out)814{815out->out_str[0] = CHAR_LEFT_PARENTHESIS;816out->out_str[1] = CHAR_ASTERISK;817out->out_str[2] = CHAR_C;818out->out_str[3] = CHAR_O;819out->out_str[4] = CHAR_M;820out->out_str[5] = CHAR_M;821out->out_str[6] = CHAR_I;822out->out_str[7] = CHAR_T;823convert_glob_write_str(out, 8);824convert_glob_write(out, CHAR_RIGHT_PARENTHESIS);825}826827828/* Bash glob converter.829830Arguments:831pattype the pattern type832pattern the pattern833plength length in code units834utf TRUE if UTF835use_buffer where to put the output836use_length length of use_buffer837bufflenptr where to put the used length838dummyrun TRUE if a dummy run839ccontext the 
convert context840841Returns: 0 => success842!0 => error code843*/844845static int846convert_glob(uint32_t options, PCRE2_SPTR pattern, PCRE2_SIZE plength,847BOOL utf, PCRE2_UCHAR *use_buffer, PCRE2_SIZE use_length,848PCRE2_SIZE *bufflenptr, BOOL dummyrun, pcre2_convert_context *ccontext)849{850pcre2_output_context out;851PCRE2_SPTR pattern_start = pattern;852PCRE2_SPTR pattern_end = pattern + plength;853PCRE2_UCHAR separator = ccontext->glob_separator;854PCRE2_UCHAR escape = ccontext->glob_escape;855PCRE2_UCHAR c;856BOOL no_wildsep = (options & PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR) != 0;857BOOL no_starstar = (options & PCRE2_CONVERT_GLOB_NO_STARSTAR) != 0;858BOOL in_atomic = FALSE;859BOOL after_starstar = FALSE;860BOOL no_slash_z = FALSE;861BOOL with_escape, is_start, after_separator;862int result = 0;863864(void)utf; /* Avoid compiler warning. */865866#ifdef SUPPORT_UNICODE867if (utf && (separator >= 128 || escape >= 128))868{869/* Currently only ASCII characters are supported. */870*bufflenptr = 0;871return PCRE2_ERROR_CONVERT_SYNTAX;872}873#endif874875with_escape = strchr(pcre2_escaped_literals, separator) != NULL;876877/* Initialize default for error offset as end of input. 
*/878out.output = use_buffer;879out.output_end = use_buffer + use_length;880out.output_size = 0;881882out.out_str[0] = CHAR_LEFT_PARENTHESIS;883out.out_str[1] = CHAR_QUESTION_MARK;884out.out_str[2] = CHAR_s;885out.out_str[3] = CHAR_RIGHT_PARENTHESIS;886convert_glob_write_str(&out, 4);887888is_start = TRUE;889890if (pattern < pattern_end && pattern[0] == CHAR_ASTERISK)891{892if (no_wildsep)893is_start = FALSE;894else if (!no_starstar && pattern + 1 < pattern_end &&895pattern[1] == CHAR_ASTERISK)896is_start = FALSE;897}898899if (is_start)900{901out.out_str[0] = CHAR_BACKSLASH;902out.out_str[1] = CHAR_A;903convert_glob_write_str(&out, 2);904}905906while (pattern < pattern_end)907{908c = *pattern++;909910if (c == CHAR_ASTERISK)911{912is_start = pattern == pattern_start + 1;913914if (in_atomic)915{916convert_glob_write(&out, CHAR_RIGHT_PARENTHESIS);917in_atomic = FALSE;918}919920if (!no_starstar && pattern < pattern_end && *pattern == CHAR_ASTERISK)921{922after_separator = is_start || (pattern[-2] == separator);923924do pattern++; while (pattern < pattern_end &&925*pattern == CHAR_ASTERISK);926927if (pattern >= pattern_end)928{929no_slash_z = TRUE;930break;931}932933after_starstar = TRUE;934935if (after_separator && escape != 0 && *pattern == escape &&936pattern + 1 < pattern_end && pattern[1] == separator)937pattern++;938939if (is_start)940{941if (*pattern != separator) continue;942943out.out_str[0] = CHAR_LEFT_PARENTHESIS;944out.out_str[1] = CHAR_QUESTION_MARK;945out.out_str[2] = CHAR_COLON;946out.out_str[3] = CHAR_BACKSLASH;947out.out_str[4] = CHAR_A;948out.out_str[5] = CHAR_VERTICAL_LINE;949convert_glob_write_str(&out, 6);950951convert_glob_print_separator(&out, separator, with_escape);952convert_glob_write(&out, CHAR_RIGHT_PARENTHESIS);953954pattern++;955continue;956}957958convert_glob_print_commit(&out);959960if (!after_separator || *pattern != separator)961{962out.out_str[0] = CHAR_DOT;963out.out_str[1] = CHAR_ASTERISK;964out.out_str[2] = 
CHAR_QUESTION_MARK;965convert_glob_write_str(&out, 3);966continue;967}968969out.out_str[0] = CHAR_LEFT_PARENTHESIS;970out.out_str[1] = CHAR_QUESTION_MARK;971out.out_str[2] = CHAR_COLON;972out.out_str[3] = CHAR_DOT;973out.out_str[4] = CHAR_ASTERISK;974out.out_str[5] = CHAR_QUESTION_MARK;975976convert_glob_write_str(&out, 6);977978convert_glob_print_separator(&out, separator, with_escape);979980out.out_str[0] = CHAR_RIGHT_PARENTHESIS;981out.out_str[1] = CHAR_QUESTION_MARK;982out.out_str[2] = CHAR_QUESTION_MARK;983convert_glob_write_str(&out, 3);984985pattern++;986continue;987}988989if (pattern < pattern_end && *pattern == CHAR_ASTERISK)990{991do pattern++; while (pattern < pattern_end &&992*pattern == CHAR_ASTERISK);993}994995if (no_wildsep)996{997if (pattern >= pattern_end)998{999no_slash_z = TRUE;1000break;1001}10021003/* Start check must be after the end check. */1004if (is_start) continue;1005}10061007if (!is_start)1008{1009if (after_starstar)1010{1011out.out_str[0] = CHAR_LEFT_PARENTHESIS;1012out.out_str[1] = CHAR_QUESTION_MARK;1013out.out_str[2] = CHAR_GREATER_THAN_SIGN;1014convert_glob_write_str(&out, 3);1015in_atomic = TRUE;1016}1017else1018convert_glob_print_commit(&out);1019}10201021if (no_wildsep)1022convert_glob_write(&out, CHAR_DOT);1023else1024convert_glob_print_wildcard(&out, separator, with_escape);10251026out.out_str[0] = CHAR_ASTERISK;1027out.out_str[1] = CHAR_QUESTION_MARK;1028if (pattern >= pattern_end)1029out.out_str[1] = CHAR_PLUS;1030convert_glob_write_str(&out, 2);1031continue;1032}10331034if (c == CHAR_QUESTION_MARK)1035{1036if (no_wildsep)1037convert_glob_write(&out, CHAR_DOT);1038else1039convert_glob_print_wildcard(&out, separator, with_escape);1040continue;1041}10421043if (c == CHAR_LEFT_SQUARE_BRACKET)1044{1045result = convert_glob_parse_range(&pattern, pattern_end,1046&out, utf, separator, with_escape, escape, no_wildsep);1047if (result != 0) break;1048continue;1049}10501051if (escape != 0 && c == escape)1052{1053if (pattern >= 
pattern_end)1054{1055result = PCRE2_ERROR_CONVERT_SYNTAX;1056break;1057}1058c = *pattern++;1059}10601061if (c < 255 && strchr(pcre2_escaped_literals, c) != NULL)1062convert_glob_write(&out, CHAR_BACKSLASH);10631064convert_glob_write(&out, c);1065}10661067if (result == 0)1068{1069if (!no_slash_z)1070{1071out.out_str[0] = CHAR_BACKSLASH;1072out.out_str[1] = CHAR_z;1073convert_glob_write_str(&out, 2);1074}10751076if (in_atomic)1077convert_glob_write(&out, CHAR_RIGHT_PARENTHESIS);10781079convert_glob_write(&out, CHAR_NUL);10801081if (!dummyrun && out.output_size != (PCRE2_SIZE) (out.output - use_buffer))1082result = PCRE2_ERROR_NOMEMORY;1083}10841085if (result != 0)1086{1087*bufflenptr = pattern - pattern_start;1088return result;1089}10901091*bufflenptr = out.output_size - 1;1092return 0;1093}109410951096/*************************************************1097* Convert pattern *1098*************************************************/10991100/* This is the external-facing function for converting other forms of pattern1101into PCRE2 regular expression patterns. 
On error, the bufflenptr argument is1102used to return an offset in the original pattern.11031104Arguments:1105pattern the input pattern1106plength length of input, or PCRE2_ZERO_TERMINATED1107options options bits1108buffptr pointer to pointer to output buffer1109bufflenptr pointer to length of output buffer1110ccontext convert context or NULL11111112Returns: 0 for success, else an error code (+ve or -ve)1113*/11141115PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION1116pcre2_pattern_convert(PCRE2_SPTR pattern, PCRE2_SIZE plength, uint32_t options,1117PCRE2_UCHAR **buffptr, PCRE2_SIZE *bufflenptr,1118pcre2_convert_context *ccontext)1119{1120int rc;1121PCRE2_UCHAR null_str[1] = { 0xcd };1122PCRE2_UCHAR dummy_buffer[DUMMY_BUFFER_SIZE];1123PCRE2_UCHAR *use_buffer = dummy_buffer;1124PCRE2_SIZE use_length = DUMMY_BUFFER_SIZE;1125BOOL utf = (options & PCRE2_CONVERT_UTF) != 0;1126uint32_t pattype = options & TYPE_OPTIONS;11271128if (pattern == NULL && plength == 0)1129pattern = null_str;11301131if (pattern == NULL || bufflenptr == NULL)1132{1133if (bufflenptr != NULL) *bufflenptr = 0; /* Error offset */1134return PCRE2_ERROR_NULL;1135}11361137if ((options & ~ALL_OPTIONS) != 0 || /* Undefined bit set */1138(pattype & (~pattype+1)) != pattype || /* More than one type set */1139pattype == 0) /* No type set */1140{1141*bufflenptr = 0; /* Error offset */1142return PCRE2_ERROR_BADOPTION;1143}11441145if (plength == PCRE2_ZERO_TERMINATED) plength = PRIV(strlen)(pattern);1146if (ccontext == NULL) ccontext =1147(pcre2_convert_context *)(&PRIV(default_convert_context));11481149/* Check UTF if required. 
*/11501151#ifndef SUPPORT_UNICODE1152if (utf)1153{1154*bufflenptr = 0; /* Error offset */1155return PCRE2_ERROR_UNICODE_NOT_SUPPORTED;1156}1157#else1158if (utf && (options & PCRE2_CONVERT_NO_UTF_CHECK) == 0)1159{1160PCRE2_SIZE erroroffset;1161rc = PRIV(valid_utf)(pattern, plength, &erroroffset);1162if (rc != 0)1163{1164*bufflenptr = erroroffset;1165return rc;1166}1167}1168#endif11691170/* If buffptr is not NULL, and what it points to is not NULL, we are being1171provided with a buffer and a length, so set them as the buffer to use. */11721173if (buffptr != NULL && *buffptr != NULL)1174{1175use_buffer = *buffptr;1176use_length = *bufflenptr;1177}11781179/* Call an individual converter, either just once (if a buffer was provided or1180just the length is needed), or twice (if a memory allocation is required). */11811182for (int i = 0; i < 2; i++)1183{1184PCRE2_UCHAR *allocated;1185BOOL dummyrun = buffptr == NULL || *buffptr == NULL;11861187switch(pattype)1188{1189case PCRE2_CONVERT_GLOB:1190rc = convert_glob(options & ~PCRE2_CONVERT_GLOB, pattern, plength, utf,1191use_buffer, use_length, bufflenptr, dummyrun, ccontext);1192break;11931194case PCRE2_CONVERT_POSIX_BASIC:1195case PCRE2_CONVERT_POSIX_EXTENDED:1196rc = convert_posix(pattype, pattern, plength, utf, use_buffer, use_length,1197bufflenptr, dummyrun, ccontext);1198break;11991200/* We have already validated pattype. */1201/* LCOV_EXCL_START */1202default:1203PCRE2_DEBUG_UNREACHABLE();1204*bufflenptr = 0; /* Error offset */1205return PCRE2_ERROR_INTERNAL;1206/* LCOV_EXCL_STOP */1207}12081209if (rc != 0 || /* Error */1210buffptr == NULL || /* Just the length is required */1211*buffptr != NULL) /* Buffer was provided or allocated */1212return rc;12131214/* Allocate memory for the buffer, with hidden space for an allocator at1215the start. The next time round the loop runs the conversion for real. 
*/12161217allocated = PRIV(memctl_malloc)(sizeof(pcre2_memctl) +1218(*bufflenptr + 1)*PCRE2_CODE_UNIT_WIDTH, (pcre2_memctl *)ccontext);1219if (allocated == NULL)1220{1221*bufflenptr = 0; /* Error offset */1222return PCRE2_ERROR_NOMEMORY;1223}1224*buffptr = (PCRE2_UCHAR *)(((char *)allocated) + sizeof(pcre2_memctl));12251226use_buffer = *buffptr;1227use_length = *bufflenptr + 1;1228}12291230/* Running the loop above ought to have succeeded the second time. */1231/* LCOV_EXCL_START */1232PCRE2_DEBUG_UNREACHABLE();1233*bufflenptr = 0; /* Error offset */1234return PCRE2_ERROR_INTERNAL;1235/* LCOV_EXCL_STOP */1236}123712381239/*************************************************1240* Free converted pattern *1241*************************************************/12421243/* This frees a converted pattern that was put in newly-allocated memory.12441245Argument: the converted pattern1246Returns: nothing1247*/12481249PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION1250pcre2_converted_pattern_free(PCRE2_UCHAR *converted)1251{1252if (converted != NULL)1253{1254pcre2_memctl *memctl =1255(pcre2_memctl *)((char *)converted - sizeof(pcre2_memctl));1256memctl->free(memctl, memctl->memory_data);1257}1258}12591260/* End of pcre2_convert.c */126112621263