6.23. Regular Expression Grab Bag
We have found these regular expressions
useful or interesting:
- Swap first two words
s/(\S+)(\s+)(\S+)/$3$2$1/
- Keyword = Value
m/^(\w+)\s*=\s*(.*?)\s*$/
# keyword is $1, value is $2
- Line of at least 80 characters
m/.{80,}/
length( ) >= 80 # ok, not a regex
- MM/DD/YY HH:MM:SS
m|(\d+)/(\d+)/(\d+) (\d+):(\d+):(\d+)|
- Changing directories
s(/usr/bin)(/usr/local/bin)g
- Expanding %7E (hex) escapes
s/%([0-9A-Fa-f][0-9A-Fa-f])/chr(hex($1))/ge
- Deleting C comments (imperfectly)
s{
/\* # Match the opening delimiter
.*? # Match a minimal number of characters
\*/ # Match the closing delimiter
}{ }gsx;
- Removing leading and trailing whitespace
s/^\s+//;
s/\s+$//;
- Turning \ followed by n into a real newline
s/\\n/\n/g;
- Removing package portion of fully qualified symbols
s/^.*:://
- Dotted quads (most IP addresses)
# XXX: fails on legal IPs 127.1 and 2130706433.
m{
^ ( \d | [01]?\d\d | 2[0-4]\d | 25[0-5] )
\. ( \d | [01]?\d\d | 2[0-4]\d | 25[0-5] )
\. ( \d | [01]?\d\d | 2[0-4]\d | 25[0-5] )
\. ( \d | [01]?\d\d | 2[0-4]\d | 25[0-5] )
$
}x
- Removing leading path from filename
s{^.*/}{}
- Extracting columns setting from TERMCAP
$cols = ( ($ENV{TERMCAP} || " ") =~ m/:co#(\d+):/ ) ? $1 : 80;
- Removing directory components from program name and arguments
($name = " $0 @ARGV") =~ s{ /\S+/}{ }g;
- Checking your operating system
die "This isn't Linux" unless $^O =~ m/linux/i;
- Joining continuation lines in multiline string
s/\n\s+/ /g
- Extracting all numbers from a string
@nums = m/(\d+\.?\d*|\.\d+)/g;
- Finding all-caps words
@capwords = m/(\b\p{ Upper-case Letter }+\b)/g;
- Finding all-lowercase words
@lowords = m/(\b\p{ Lower-case Letter }+\b)/g;
- Finding initial-caps word
@icwords = m{
( \b
[\p{ Upper-case Letter }\p{ Title-case Letter }]
\p{ Lower-case Letter } *
\b )
}gx;
- Finding links in simple HTML
@links = m/<A[^>]+?HREF\s*=\s*["']?([^'" >]+?)['"]?\s*>/ig;
- Finding middle initial in $_
$initial = /^\S+\s+(\S)\S*\s+\S/ ? $1 : "";
- Changing double vertical prime pairs to curly quotes
s/"([^"]*)"/``$1''/g # old way
# next is unicode only
s/"([^"]*)"/\x{201C}\x{201C}$1\x{201D}\x{201D}/g
- Extracting sentences (double spaces required between each)
{ local $/ = "";
while (<>) {
s/\n/ /g;
s/ {3,}/  /g;
push @sentences, m/(\S.*?[!?.])(?= {2}|\Z)/g;
}
}
- YYYY-MM-DD
m/\b(\d{4})-(\d\d)-(\d\d)\b/
# YYYY in $1, MM in $2, DD in $3
- North American telephone numbers
m/ ^
(?:
1 \s (?: \d\d\d \s)? # 1, or 1 and area code
| # ... or ...
\(\d\d\d\) \s # area code with parens
| # ... or ...
(?: \+\d\d?\d? \s)? # optional +country code
\d\d\d ([\s\-]) # and area code
)
\d\d\d (\s|\1) # prefix (and area code separator)
\d\d\d\d # exchange
$
/x
- Exclamations
m/\boh\s+my\s+gh?o(d(dess(es)?|s?)|odness|sh)\b/i
- Extracting lines regardless of line terminator
push(@lines, $1) while $input =~ s{
^ # gobble from front
(
. # begin $1: any single char (/s)
*? # but minimally matching even none
)
(?: # make capturing if saving terminators
\x0D \x0A # CRLF
| \x0A # LF
| \x0D # CR
| \x0C # FF
# (see http://www.unicode.org/reports/tr13/tr13-9.html)
| \x{2028} # Unicode LS
| \x{2029} # Unicode PS
)
}{}sx; # consumes $input
Or use split:
@lines = split m{
(?: # make capturing if saving terminators
\x0D \x0A # CRLF
| \x0A # LF
| \x0D # CR
| \x0C # FF
# (see http://www.unicode.org/reports/tr13/tr13-9.html)
| \x{2028} # Unicode LS
| \x{2029} # Unicode PS
)
}x, $input;