// Copyright 2014 The Kyua Authors.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors
//   may be used to endorse or promote products derived from this software
//   without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.2728#include "utils/text/regex.hpp"2930extern "C" {31#include <sys/types.h>3233#include <regex.h>34}3536#include "utils/auto_array.ipp"37#include "utils/defs.hpp"38#include "utils/format/macros.hpp"39#include "utils/noncopyable.hpp"40#include "utils/sanity.hpp"41#include "utils/text/exceptions.hpp"4243namespace text = utils::text;444546namespace {474849static void throw_regex_error(const int, const ::regex_t*, const std::string&)50UTILS_NORETURN;515253/// Constructs and raises a regex_error.54///55/// \param error The error code returned by regcomp(3) or regexec(3).56/// \param preg The native regex object that caused this error.57/// \param prefix Error message prefix string.58///59/// \throw regex_error The constructed exception.60static void61throw_regex_error(const int error, const ::regex_t* preg,62const std::string& prefix)63{64char buffer[1024];6566// TODO(jmmv): Would be nice to handle the case where the message does67// not fit in the temporary buffer.68(void)::regerror(error, preg, buffer, sizeof(buffer));6970throw text::regex_error(F("%s: %s") % prefix % buffer);71}727374} // anonymous namespace757677/// Internal implementation for regex_matches.78struct utils::text::regex_matches::impl : utils::noncopyable {79/// String on which we are matching.80///81/// In theory, we could take a reference here instead of a copy, and make82/// it a requirement for the caller to ensure that the lifecycle of the83/// input string outlasts 
the lifecycle of the regex_matches. However, that84/// contract is very easy to break with hardcoded strings (as we do in85/// tests). Just go for the safer case here.86const std::string _string;8788/// Maximum number of matching groups we expect, including the full match.89///90/// In other words, this is the size of the _matches array.91const std::size_t _nmatches;9293/// Native regular expression match representation.94utils::auto_array< ::regmatch_t > _matches;9596/// Constructor.97///98/// This executes the regex on the given string and sets up the internal99/// class state based on the results.100///101/// \param preg The native regex object.102/// \param str The string on which to execute the regex.103/// \param ngroups Number of capture groups in the regex. This is an upper104/// bound and may be greater than the actual matches.105///106/// \throw regex_error If the call to regexec(3) fails.107impl(const ::regex_t* preg, const std::string& str,108const std::size_t ngroups) :109_string(str),110_nmatches(ngroups + 1),111_matches(new ::regmatch_t[_nmatches])112{113const int error = ::regexec(preg, _string.c_str(), _nmatches,114_matches.get(), 0);115if (error == REG_NOMATCH) {116_matches.reset(NULL);117} else if (error != 0) {118throw_regex_error(error, preg,119F("regexec on '%s' failed") % _string);120}121}122123/// Destructor.124~impl(void)125{126}127};128129130/// Constructor.131///132/// \param pimpl Constructed implementation of the object.133text::regex_matches::regex_matches(std::shared_ptr< impl > pimpl) :134_pimpl(pimpl)135{136}137138139/// Destructor.140text::regex_matches::~regex_matches(void)141{142}143144145/// Returns the number of matches in this object.146///147/// Note that this does not correspond to the number of groups provided at148/// construction time. 
The returned value here accounts for only the returned149/// valid matches.150///151/// \return Number of matches, including the full match.152std::size_t153text::regex_matches::count(void) const154{155std::size_t total = 0;156if (_pimpl->_matches.get() != NULL) {157for (std::size_t i = 0; i < _pimpl->_nmatches; ++i) {158if (_pimpl->_matches[i].rm_so != -1)159++total;160}161INV(total <= _pimpl->_nmatches);162}163return total;164}165166167/// Gets a match.168///169/// \param index Number of the match to get. Index 0 always contains the match170/// of the whole regex.171///172/// \pre There regex must have matched the input string.173/// \pre index must be lower than count().174///175/// \return The textual match.176std::string177text::regex_matches::get(const std::size_t index) const178{179PRE(*this);180PRE(index < count());181182const ::regmatch_t* match = &_pimpl->_matches[index];183184return std::string(_pimpl->_string.c_str() + match->rm_so,185match->rm_eo - match->rm_so);186}187188189/// Checks if there are any matches.190///191/// \return True if the object contains one or more matches; false otherwise.192text::regex_matches::operator bool(void) const193{194return _pimpl->_matches.get() != NULL;195}196197198/// Internal implementation for regex.199struct utils::text::regex::impl : utils::noncopyable {200/// Native regular expression representation.201::regex_t _preg;202203/// Number of capture groups in the regular expression. This is an upper204/// bound and does NOT include the default full string match.205std::size_t _ngroups;206207/// Constructor.208///209/// This compiles the given regular expression.210///211/// \param regex_ The regular expression to compile.212/// \param ngroups Number of capture groups in the regular expression. 
This213/// is an upper bound and does NOT include the default full string214/// match.215/// \param ignore_case Whether to ignore case during matching.216///217/// \throw regex_error If the call to regcomp(3) fails.218impl(const std::string& regex_, const std::size_t ngroups,219const bool ignore_case) :220_ngroups(ngroups)221{222const int flags = REG_EXTENDED | (ignore_case ? REG_ICASE : 0);223const int error = ::regcomp(&_preg, regex_.c_str(), flags);224if (error != 0)225throw_regex_error(error, &_preg, F("regcomp on '%s' failed")226% regex_);227}228229/// Destructor.230~impl(void)231{232::regfree(&_preg);233}234};235236237/// Constructor.238///239/// \param pimpl Constructed implementation of the object.240text::regex::regex(std::shared_ptr< impl > pimpl) : _pimpl(pimpl)241{242}243244245/// Destructor.246text::regex::~regex(void)247{248}249250251/// Compiles a new regular expression.252///253/// \param regex_ The regular expression to compile.254/// \param ngroups Number of capture groups in the regular expression. 
This is255/// an upper bound and does NOT include the default full string match.256/// \param ignore_case Whether to ignore case during matching.257///258/// \return A new regular expression, ready to match strings.259///260/// \throw regex_error If the regular expression is invalid and cannot be261/// compiled.262text::regex263text::regex::compile(const std::string& regex_, const std::size_t ngroups,264const bool ignore_case)265{266return regex(std::shared_ptr< impl >(new impl(regex_, ngroups,267ignore_case)));268}269270271/// Matches the regular expression against a string.272///273/// \param str String to match the regular expression against.274///275/// \return A new regex_matches object with the results of the match.276text::regex_matches277text::regex::match(const std::string& str) const278{279std::shared_ptr< regex_matches::impl > pimpl(new regex_matches::impl(280&_pimpl->_preg, str, _pimpl->_ngroups));281return regex_matches(pimpl);282}283284285/// Compiles and matches a regular expression once.286///287/// This is syntactic sugar to simplify the instantiation of a new regex object288/// and its subsequent match on a string.289///290/// \param regex_ The regular expression to compile and match.291/// \param str String to match the regular expression against.292/// \param ngroups Number of capture groups in the regular expression.293/// \param ignore_case Whether to ignore case during matching.294///295/// \return A new regex_matches object with the results of the match.296text::regex_matches297text::match_regex(const std::string& regex_, const std::string& str,298const std::size_t ngroups, const bool ignore_case)299{300return regex::compile(regex_, ngroups, ignore_case).match(str);301}302303304