| (1) get a group from a string
|
import re
# Extract the part of the file name that precedes the ".lst" extension.
text = "Asian.lst"
result = re.search(r"(.*)\.lst", text)
if result:
    # group(1) is the text captured by the first parenthesised group
    filename = result.group(1)
    print("filename:", filename)
|
import std.stdio;
import std.regex;
/// Pulls the base name out of "Asian.lst" with a capture group and prints it.
void main()
{
    string input = "Asian.lst";
    auto pattern = regex(r"(.*)\.lst");

    // The body runs only when the pattern matched.
    if (auto m = matchFirst(input, pattern))
    {
        // m[1] is capture group 1; m.captures[1] is the longer spelling
        writeln("filename: ", m[1]);
    }
}
|
Output:
| (2) match a string against a regexp
|
import re
# Check whether the pattern occurs anywhere in the string.
text = "Asian.lst"
result = re.search(r"ian", text)
if result:
    print("contains 'ian'")
|
import std.stdio;
import std.regex;
/// Reports whether "Asian.lst" contains a match for the pattern "ian".
void main()
{
    string subject = "Asian.lst";
    auto needle = regex(r"ian");

    // No match: nothing to report.
    if (!matchFirst(subject, needle))
        return;

    writeln("contains 'ian'");
}
|
Output:
| (3) find all the occurrences of a substring in a string
|
import re
# Sample markup: two quoted href values and one unquoted.
text = """
<a href="ad1">sdqs</a>
<a href="ad2">sds</a>
<a href=ad3>qs</a>
"""
# The first pattern captures the href value (quotes optional);
# the second matches the "adN" tokens directly.
for pattern in (r'href="?(.*?)"?>', r"ad\d"):
    m = re.findall(pattern, text)
    print(m)  # ['ad1', 'ad2', 'ad3']
|
import std.stdio;
import std.regex;
import std.algorithm;
import std.array;
// Sample HTML fragment: two anchors with quoted href values, one with an unquoted value.
const text = `
<a href="ad1">sdqs</a>
<a href="ad2">sds</a>
<a href=ad3>qs</a>
`;
/**
 * Rough equivalent of Python's re.findall() for patterns with at most one
 * capture group.
 *
 * Params:
 *   re   = regular expression source
 *   text = string to search
 *
 * Returns: for a pattern without capture groups, the full text of every
 *          match; for a pattern with exactly one capture group, the text of
 *          that group for every match. Empty when nothing matches.
 *
 * Patterns with more than one capture group are not implemented and hit
 * assert(0) as soon as a match is found.
 */
string[] findAll(const string re, const string text)
{
    // Compile the pattern once and walk the input once; every match of a
    // given pattern reports the same number of capture groups.
    auto pattern = regex(re);
    string[] found;

    foreach (m; text.matchAll(pattern))
    {
        switch (m.length - 1) // length also counts the whole match (group 0)
        {
        case 0:
            found ~= m.hit;
            break;
        case 1:
            found ~= m[1];
            break;
        default:
            assert(0, "Error: using more than 1 capture group is not implemented.");
        }
    }
    return found;
}
/// Runs findAll with a one-group pattern and a no-group pattern over the sample text.
void main()
{
    // Both patterns are expected to produce the same three values.
    foreach (pattern; [`href="?(.*?)"?>`, `ad\d`])
    {
        writeln(findAll(pattern, text)); // ["ad1", "ad2", "ad3"]
    }
}
|
Problem with backreferences
The std.regex package in the stdlib is not perfect. Backreferences don't work correctly :( I ran into a problem that has existed since 2015…
On Discord, Paul Backus summarized it as follows: "It's specifically the combination of (1) a backreference, (2) with a .* in front of it, (3) with "extra" characters at the start of the string and between the two parts that are supposed to match."
Links:
Example with a workaround:
import std.regex;
import std.stdio;
/// Demonstrates the backreference workaround: ".*" is replaced by a bounded repetition.
void main()
{
    string text = "baacaa";

    // Buggy form, kept for reference: regex(r"(..).*\1")
    auto workaround = regex(r"(..).{0,999999}\1");

    if (matchFirst(text, workaround))
    {
        writeln(text);
    }
}
The workaround was posted by pbackus (thanks!). His comment: "As a workaround, you can replace .* with .{0,N} for some large value of N".
| (4) re.sub()
|
import re
def main():
    """Replace the domain of every e-mail address in the sample string and print the result."""
    s = "vmi jabba@gmail.com valami leia@gmail.com"
    # \1 re-inserts the captured user name; only the domain part changes
    result = re.sub(r"([\w.]+)@[\w.]+", r"\1@tattoine.com", s)
    print(result)


# Run the example when executed as a script (the D version's main() runs automatically).
if __name__ == "__main__":
    main()
|
import std.stdio;
import std.regex;
/// Rewrites the domain of each e-mail address in the sample string and prints the result.
void main()
{
    auto emailPattern = regex(r"([\w.]+)@[\w.]+");
    string input = "vmi jabba@gmail.com valami leia@gmail.com";

    // D's replacement format refers to capture group 1 as $1 (Python uses \1).
    writeln(replaceAll(input, emailPattern, r"$1@tattoine.com"));
}
|
Notice that in D, when you call s.replaceAll(), the first argument is a regex object, while the second argument is a normal string.
Output:
vmi jabba@tattoine.com valami leia@tattoine.com