Perl Cd Bookshelf [Electronic resources]

نسخه متنی -صفحه : 875/ 500

6.23. Regular Expression Grab Bag

We have found these regular expressions useful or interesting:

Swap first two words

s/(\S+)(\s+)(\S+)/$3$2$1/

Keyword = Value

m/^(\w+)\s*=\s*(.*?)\s*$/     
        # keyword is $1, value is $2

Line of at least 80 characters

m/.{80,}/
length( ) >= 80        # ok, not a regex

MM/DD/YY HH:MM:SS

m|(\d+)/(\d+)/(\d+) (\d+):(\d+):(\d+)|

Changing directories

s(/usr/bin)(/usr/local/bin)g

Expanding %7E (hex) escapes

s/%([0-9A-Fa-f][0-9A-Fa-f])/chr(hex($1))/ge

Deleting C comments (imperfectly)

s{
/\*                   # Match the opening delimiter
.*?                   # Match a minimal number of characters
\*/                   # Match the closing delimiter
}{}gsx;

Removing leading and trailing whitespace

s/^\s+//;
s/\s+$//;

Turning \ followed by n into a real newline

s/\\n/\n/g;

Removing package portion of fully qualified symbols

s/^.*:://

Dotted quads (most IP addresses)

# XXX: fails on legal IPs 127.1 and 2130706433.
m{
^  ( \d | [01]?\d\d | 2[0-4]\d | 25[0-5] )
\.  ( \d | [01]?\d\d | 2[0-4]\d | 25[0-5] )
\.  ( \d | [01]?\d\d | 2[0-4]\d | 25[0-5] )
\.  ( \d | [01]?\d\d | 2[0-4]\d | 25[0-5] )
$
}x

Removing leading path from filename

s{^.*/}{}

Extracting columns setting from TERMCAP

$cols = ( ($ENV{TERMCAP} || " ") =~ m/:co#(\d+):/ ) ? $1 : 80;

Removing directory components from program name and arguments

($name = " $0 @ARGV") =~ s{ /\S+/}{ }g;

Checking your operating system

die "This isn't Linux" unless $^O =~ m/linux/i;

Joining continuation lines in multiline string

s/\n\s+/ /g

Extracting all numbers from a string

@nums = m/(\d+\.?\d*|\.\d+)/g;

Finding all-caps words

@capwords = m/(\b\p{ Upper-case Letter }+\b)/g;

Finding all-lowercase words

@lowords = m/(\b\p{ Lower-case Letter }+\b)/g;

Finding initial-caps word

@icwords = m{
( \b
[\p{ Upper-case Letter }\p{ Title-case Letter }]
\p{  Lower-case Letter } *
\b )
}gx;

Finding links in simple HTML

@links = m/<A[^>]+?HREF\s*=\s*["']?([^'" >]+?)['"]?\s*>/ig;

Finding middle initial in $_

$initial = /^\S+\s+(\S)\S*\s+\S/ ? $1 : "";

Changing double vertical prime pairs to curly quotes

s/"([^"]*)"/``$1''/g # old way
# next is unicode only
s/"([^"]*)"/\x{201C}$1\x{201D}/g

Extracting sentences (double spaces required between each)

{ local $/ = "";
while (<>) {
s/\n/ /g;
s/ {3,}/  /g;
push @sentences, m/(\S.*?[!?.])(?= {2}|\Z)/g;
}
}

YYYY-MM-DD

m/\b(\d{4})-(\d\d)-(\d\d)\b/       
    # YYYY in $1, MM in $2, DD in $3

North American telephone numbers

m/ ^
(?:
1 \s (?: \d\d\d \s)?            # 1, or 1 and area code
|                               # ... or ...
\(\d\d\d\) \s                   # area code with parens
|                               # ... or ...
(?: \+\d\d?\d? \s)?             # optional +country code
\d\d\d ([\s\-])                 # and area code
)
\d\d\d (\s|\1)                   # prefix (and area code separator)
\d\d\d\d                         # exchange
$
/x

Exclamations

m/\boh\s+my\s+gh?o(d(dess(es)?|s?)|odness|sh)\b/i

Extracting lines regardless of line terminator

push(@lines, $1) while $input =~ s{
^                                # gobble from front
(     
.*?                       # begin $1: any single char (/s), but minimally matching even none
)                        
(?:                           # make capturing if saving terminators
\x0D \x0A                # CRLF        
|   \x0A                 # LF
|   \x0D                # CR
|   \x0C                # FF
# (see http://www.unicode.org/reports/tr13/tr13-9.html)
|   \x{2028}                # Unicode LS
|   \x{2029}                # Unicode PS
)
}{}sx;                          # consumes $input

Or use split:

@lines = split m{
(?:                           # make capturing if saving terminators
\x0D \x0A                # CRLF        
|   \x0A                 # LF
|   \x0D                # CR
|   \x0C                # FF
# (see http://www.unicode.org/reports/tr13/tr13-9.html)
|   \x{2028}                # Unicode LS
|   \x{2029}                # Unicode PS
)
}x, $input;