badvpn/ncd/modules/regex_match.c
Ambroz Bizjak 347fa1a7a5 ncd: Store the NCDStingIndex pointer in NCDValMem.
This cuts down on the boilerplate especially with ncd_make_boolean().
2015-05-09 00:01:49 +02:00

363 lines
11 KiB
C

/**
* @file regex_match.c
* @author Ambroz Bizjak <ambrop7@gmail.com>
*
* @section LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the author nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* @section DESCRIPTION
*
* Regular expression matching module.
*
* Synopsis:
* regex_match(string input, string regex)
*
* Variables:
* succeeded - "true" or "false", indicating whether input matched regex
* matchN - for N=0,1,2,..., the matching data for the N-th subexpression
* (match0 = whole match)
*
* Description:
* Matches 'input' with the POSIX extended regular expression 'regex'.
* 'regex' must be a string without null bytes, but 'input' can contain null bytes.
* However, it's difficult, if not impossible, to actually match nulls with the regular
* expression.
* The input and regex strings are interpreted according to the POSIX regex functions
* (regcomp(), regexec()); in particular, the current locale setting affects the
* interpretation.
*
* Synopsis:
* regex_replace(string input, list(string) regex, list(string) replace)
*
* Variables:
* string (empty) - transformed input
*
* Description:
* Replaces matching parts of a string. Replacement is performed by repeatedly matching
* the remaining part of the string with all regular expressions. On each step, out of
* all regular expressions that match the remainder of the string, the one whose match
* starts at the least position wins, and the matching part is replaced with the
* replacement string corresponding to this regular expression. The process continues
* from the end of the just-replaced portion until no more regular expressions match.
* If multiple regular expressions match at the least position, the one that appears
* first in the 'regex' argument wins.
*/
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <regex.h>
#include <misc/string_begins_with.h>
#include <misc/parse_number.h>
#include <misc/expstring.h>
#include <misc/debug.h>
#include <misc/balloc.h>
#include <ncd/module_common.h>
#include <generated/blog_channel_ncd_regex_match.h>
// Capacity of the per-instance match array. It is passed to regexec() as the
// number of match slots, and func_getvar rejects matchN for N >= this value.
#define MAX_MATCHES 64
// Per-instance state of a regex_match() statement.
struct instance {
// module instance handle, stored by func_new and used for logging/signalling
NCDModuleInst *i;
// the input string as obtained via NCDVal_StringMemRef() in func_new;
// matchN variables are produced as sub-ranges of this
MemRef input;
// nonzero iff regexec() returned 0 in func_new
int succeeded;
// NOTE(review): not read or written by any function visible in this file
int num_matches;
// match offsets filled in by regexec(); entry 0 is the whole match.
// Only inspected by func_getvar when 'succeeded' is nonzero.
regmatch_t matches[MAX_MATCHES];
};
// Per-instance state of a regex_replace() statement.
struct replace_instance {
// module instance handle, stored by replace_func_new
NCDModuleInst *i;
// the transformed string, taken from the ExpString built in replace_func_new;
// its buffer is released with BFree() in replace_func_die
MemRef output;
};
/**
 * Constructor for regex_match(input, regex).
 *
 * Validates the two arguments, compiles the regex as a POSIX extended
 * expression, runs it once over the whole input and records the outcome
 * (success flag and match offsets) in the instance. The statement then goes
 * up regardless of whether the regex matched; it only dies with an error when
 * the arguments are invalid or the regex cannot be compiled.
 */
static void func_new (void *vo, NCDModuleInst *i, const struct NCDModuleInst_new_params *params)
{
    struct instance *o = vo;
    o->i = i;

    // expect exactly (input, regex)
    NCDValRef subject;
    NCDValRef pattern;
    if (!NCDVal_ListRead(params->args, 2, &subject, &pattern)) {
        ModuleLog(o->i, BLOG_ERROR, "wrong arity");
        goto fail0;
    }

    // the input may be any string; the regex must not contain null bytes
    if (!(NCDVal_IsString(subject) && NCDVal_IsStringNoNulls(pattern))) {
        ModuleLog(o->i, BLOG_ERROR, "wrong type");
        goto fail0;
    }

    // remember the input; its length must be representable in regoff_t
    o->input = NCDVal_StringMemRef(subject);
    if (o->input.len > INT_MAX) {
        ModuleLog(o->i, BLOG_ERROR, "input string too long");
        goto fail0;
    }

    // regcomp() needs a null-terminated pattern
    NCDValNullTermString pattern_cstr;
    if (!NCDVal_StringNullTerminate(pattern, &pattern_cstr)) {
        ModuleLog(i, BLOG_ERROR, "NCDVal_StringNullTerminate failed");
        goto fail0;
    }

    // compile; the terminated copy is no longer needed afterwards
    regex_t compiled;
    int comp_status = regcomp(&compiled, pattern_cstr.data, REG_EXTENDED);
    NCDValNullTermString_Free(&pattern_cstr);
    if (comp_status != 0) {
        ModuleLog(o->i, BLOG_ERROR, "regcomp failed (error=%d)", comp_status);
        goto fail0;
    }

    // with REG_STARTEND the first match slot delimits the range to search,
    // which lets the input contain null bytes
    regmatch_t *whole = &o->matches[0];
    whole->rm_so = 0;
    whole->rm_eo = o->input.len;
    int exec_status = regexec(&compiled, o->input.ptr, MAX_MATCHES, o->matches, REG_STARTEND);
    o->succeeded = (exec_status == 0);

    // the compiled regex is only needed for this single execution
    regfree(&compiled);

    // report the statement as up
    NCDModuleInst_Backend_Up(o->i);
    return;

fail0:
    NCDModuleInst_Backend_DeadError(i);
}
/**
 * Variable resolver for regex_match().
 *
 * "succeeded" yields a boolean reflecting the match outcome. "matchN" (N a
 * non-negative integer) yields the part of the input covered by the N-th
 * subexpression, provided the match succeeded, N is below MAX_MATCHES and
 * that subexpression has a non-negative start offset.
 *
 * Returns 1 with *out set when the name was resolved, 0 otherwise.
 */
static int func_getvar (void *vo, const char *name, NCDValMem *mem, NCDValRef *out)
{
    struct instance *o = vo;

    if (strcmp(name, "succeeded") == 0) {
        *out = ncd_make_boolean(mem, o->succeeded);
        return 1;
    }

    // everything else must look like "match<number>"
    size_t prefix_len = string_begins_with(name, "match");
    if (!prefix_len) {
        return 0;
    }
    uintmax_t index;
    if (!parse_unsigned_integer(MemRef_MakeCstr(name + prefix_len), &index)) {
        return 0;
    }

    // the match array is only inspected after a successful match and within bounds
    if (!o->succeeded || index >= MAX_MATCHES || o->matches[index].rm_so < 0) {
        return 0;
    }

    regmatch_t *range = &o->matches[index];
    ASSERT(range->rm_so <= o->input.len)
    ASSERT(range->rm_eo >= range->rm_so)
    ASSERT(range->rm_eo <= o->input.len)

    size_t range_len = range->rm_eo - range->rm_so;
    MemRef piece = MemRef_Sub(o->input, range->rm_so, range_len);
    *out = NCDVal_NewStringBinMr(mem, piece);
    return 1;
}
/**
 * Constructor for regex_replace(input, regex_list, replace_list).
 *
 * Compiles every regex (POSIX extended), then scans the input from left to
 * right. At each step all regexes are run against the remaining input; the
 * match with the lowest start offset wins (ties go to the regex listed first),
 * the text before it is copied to the output, followed by the corresponding
 * replacement, and scanning resumes after the match. When nothing matches the
 * rest of the input is copied verbatim.
 *
 * A match of zero length consumes no input, so after emitting its replacement
 * one input byte is copied through unchanged and skipped; otherwise the scan
 * position would never advance.
 *
 * On success the output buffer is handed to the instance (released in
 * replace_func_die) and the statement goes up. On any failure all temporary
 * resources are released and the statement dies with an error.
 */
static void replace_func_new (void *vo, NCDModuleInst *i, const struct NCDModuleInst_new_params *params)
{
    struct replace_instance *o = vo;
    o->i = i;

    // read arguments
    NCDValRef input_arg;
    NCDValRef regex_arg;
    NCDValRef replace_arg;
    if (!NCDVal_ListRead(params->args, 3, &input_arg, &regex_arg, &replace_arg)) {
        ModuleLog(i, BLOG_ERROR, "wrong arity");
        goto fail1;
    }
    if (!NCDVal_IsString(input_arg) || !NCDVal_IsList(regex_arg) || !NCDVal_IsList(replace_arg)) {
        ModuleLog(i, BLOG_ERROR, "wrong type");
        goto fail1;
    }

    // make sure we don't overflow regoff_t (same limit as regex_match)
    MemRef in = NCDVal_StringMemRef(input_arg);
    if (in.len > INT_MAX) {
        ModuleLog(i, BLOG_ERROR, "input string too long");
        goto fail1;
    }

    // check number of regex/replace
    if (NCDVal_ListCount(regex_arg) != NCDVal_ListCount(replace_arg)) {
        ModuleLog(i, BLOG_ERROR, "number of regex's is not the same as number of replacements");
        goto fail1;
    }
    size_t num_regex = NCDVal_ListCount(regex_arg);

    // allocate array for compiled regex's
    regex_t *regs = BAllocArray(num_regex, sizeof(regs[0]));
    if (!regs) {
        ModuleLog(i, BLOG_ERROR, "BAllocArray failed");
        goto fail1;
    }
    // number of successfully compiled entries in regs; these need regfree()
    size_t num_done_regex = 0;

    // compile regex's, check arguments
    while (num_done_regex < num_regex) {
        NCDValRef regex = NCDVal_ListGet(regex_arg, num_done_regex);
        NCDValRef replace = NCDVal_ListGet(replace_arg, num_done_regex);

        if (!NCDVal_IsStringNoNulls(regex) || !NCDVal_IsString(replace)) {
            ModuleLog(i, BLOG_ERROR, "wrong regex/replace type for pair %zu", num_done_regex);
            goto fail2;
        }

        // null terminate regex
        NCDValNullTermString regex_nts;
        if (!NCDVal_StringNullTerminate(regex, &regex_nts)) {
            ModuleLog(i, BLOG_ERROR, "NCDVal_StringNullTerminate failed");
            goto fail2;
        }

        int res = regcomp(&regs[num_done_regex], regex_nts.data, REG_EXTENDED);
        NCDValNullTermString_Free(&regex_nts);
        if (res != 0) {
            ModuleLog(i, BLOG_ERROR, "regcomp failed for pair %zu (error=%d)", num_done_regex, res);
            goto fail2;
        }

        num_done_regex++;
    }

    // init output string
    ExpString out;
    if (!ExpString_Init(&out)) {
        ModuleLog(i, BLOG_ERROR, "ExpString_Init failed");
        goto fail2;
    }

    // input state
    size_t in_pos = 0;

    // process input
    while (in_pos < in.len) {
        // find first match
        int have_match = 0;
        size_t match_regex = 0; // to remove warning
        regmatch_t match = {0, 0}; // to remove warning
        for (size_t j = 0; j < num_regex; j++) {
            // REG_STARTEND: search exactly the remaining bytes, nulls included;
            // reported offsets are relative to in.ptr + in_pos
            regmatch_t this_match;
            this_match.rm_so = 0;
            this_match.rm_eo = in.len - in_pos;
            // strict '<' keeps the earliest-listed regex on equal start offsets
            if (regexec(&regs[j], in.ptr + in_pos, 1, &this_match, REG_STARTEND) == 0 && (!have_match || this_match.rm_so < match.rm_so)) {
                have_match = 1;
                match_regex = j;
                match = this_match;
            }
        }

        // if no match, append remaining data and finish
        if (!have_match) {
            if (!ExpString_AppendBinaryMr(&out, MemRef_SubFrom(in, in_pos))) {
                ModuleLog(i, BLOG_ERROR, "ExpString_AppendBinaryMr failed");
                goto fail3;
            }
            break;
        }

        // append data before match
        if (!ExpString_AppendBinaryMr(&out, MemRef_Sub(in, in_pos, match.rm_so))) {
            ModuleLog(i, BLOG_ERROR, "ExpString_AppendBinaryMr failed");
            goto fail3;
        }

        // append replacement data
        NCDValRef replace = NCDVal_ListGet(replace_arg, match_regex);
        if (!ExpString_AppendBinaryMr(&out, NCDVal_StringMemRef(replace))) {
            ModuleLog(i, BLOG_ERROR, "ExpString_AppendBinaryMr failed");
            goto fail3;
        }

        // number of input bytes consumed by this step
        size_t consumed = match.rm_eo;

        // An empty match consumes nothing by itself. Copy the byte following
        // it through unchanged (if there is one) and step over it, so that
        // the scan position always advances and the loop terminates.
        if (match.rm_eo == match.rm_so) {
            size_t next = in_pos + consumed;
            if (next < in.len) {
                if (!ExpString_AppendBinaryMr(&out, MemRef_Sub(in, next, 1))) {
                    ModuleLog(i, BLOG_ERROR, "ExpString_AppendBinaryMr failed");
                    goto fail3;
                }
                consumed++;
            }
        }

        in_pos += consumed;
    }

    // set output; the buffer is now owned by the instance
    o->output = ExpString_GetMr(&out);

    // free compiled regex's
    while (num_done_regex-- > 0) {
        regfree(&regs[num_done_regex]);
    }

    // free array
    BFree(regs);

    // signal up
    NCDModuleInst_Backend_Up(i);
    return;

fail3:
    ExpString_Free(&out);
fail2:
    while (num_done_regex-- > 0) {
        regfree(&regs[num_done_regex]);
    }
    BFree(regs);
fail1:
    NCDModuleInst_Backend_DeadError(i);
}
/**
 * Destructor for regex_replace(): releases the output buffer held by the
 * instance and then reports the statement as dead.
 */
static void replace_func_die (void *vo)
{
    struct replace_instance *self = vo;

    // the buffer behind 'output' was allocated while building the result
    char *output_buf = (char *)self->output.ptr;
    BFree(output_buf);

    NCDModuleInst_Backend_Dead(self->i);
}
/**
 * Variable resolver for regex_replace(): only the empty variable name is
 * recognised, and it yields the transformed string.
 *
 * Returns 1 with *out set for the empty name, 0 for any other name.
 */
static int replace_func_getvar (void *vo, const char *name, NCDValMem *mem, NCDValRef *out)
{
    struct replace_instance *self = vo;

    // any non-empty name is unknown to this statement
    if (name[0] != '\0') {
        return 0;
    }

    *out = NCDVal_NewStringBinMr(mem, self->output);
    return 1;
}
// Table of statement types implemented by this file, terminated by an entry
// whose type is NULL.
static struct NCDModule modules[] = {
{
// regex_match: no func_die is registered for this type
.type = "regex_match",
.func_new2 = func_new,
.func_getvar = func_getvar,
.alloc_size = sizeof(struct instance)
}, {
// regex_replace: func_die releases the output buffer
.type = "regex_replace",
.func_new2 = replace_func_new,
.func_die = replace_func_die,
.func_getvar = replace_func_getvar,
.alloc_size = sizeof(struct replace_instance)
}, {
// terminator
.type = NULL
}
};
// Module group exported by this file (the only non-static symbol defined
// here); it points at the module table above.
const struct NCDModuleGroup ncdmodule_regex_match = {
.modules = modules
};