分類： Perl

精簡扼要的 Perl 課程講義（六）：常規表達式（Regular Expression）

常規表達式（一）

# Regular expressions, part 1

# (1) Basic pattern matching with "=~" and "!~"

# "=~" tests a string against a pattern: the expression is true when the
# pattern matches and false when it does not.
"Hello World" =~ /World/;

$string = "Hello World!";

# Prints only when the match succeeds.
print "It matches\n" if $string =~ /World/;

# "!~" is the negated form: prints only when the match fails.
print "It doesn't match\n" if $string !~ /World/;

$_ = "Hello World";

# With no explicit target, the pattern is matched against $_.
print "It matches\n" if /World/;

# Matching is case-sensitive, so this fails ("world" vs "World").
"Hello World" =~ /world/;

# A space is an ordinary character in a pattern, so this succeeds.
"Hello World" =~ /o W/;

# Fails: the string has no space after "World".
"Hello World" =~ /World /;


# (2) Custom delimiters

# With an explicit leading "m", another delimiter may replace the slashes.
# Equivalent to "Hello World" =~ /World/;
"Hello World" =~ m!World!;

"Hello World" =~ m{World};  # The same match, using bracketing delimiters

# Succeeds; with '"' as the delimiter, '/' is an ordinary character
# and needs no escaping.
"/usr/bin/perl" =~ m"/perl";


# (3) Metacharacters and character classes

# ^ : anchors the match at the start of the line

"Hello World" =~ /^Hello/;  # succeeds

# Fails, because "World" is not at the start of the line.
"Hello World" =~ /^World/;


# $ : anchors the match at the end of the line

"Hello World" =~ /World$/;  # succeeds

# Fails, because "Hello" is not at the end of the line.
"Hello World" =~ /Hello$/;


# . : matches any single character except a newline (\n)

"Hello World" =~ /Wo.ld/;   # succeeds: '.' matches the 'r'


# * : matches the preceding item zero or more times (as many as possible)
# + : matches the preceding item one or more times (as many as possible)
# ? : matches the preceding item zero or one time (one if possible)

"Hello World" =~ /Hel*o/;   # l* means zero or more 'l'; succeeds

"Hello World" =~ /Hel+o/;   # l+ means one or more 'l'; succeeds

# .* means any character, zero or more times; the match succeeds,
# but because the quantifier is greedy the text Perl matches is
# 'That is a cat, not a hat', not just 'That'.
"That is a cat, not a hat." =~ /T.*at/;

# Appending '?' to the quantifier makes it match as little as possible,
# so here the text Perl matches is 'That'.
"That is a cat, not a hat." =~ /T.*?at/;


# [] : matches any one of the characters listed inside the brackets

# Requires the first character to be A, B or C; fails because "Hat"
# starts with 'H'.
"Hat" =~ /^[ABC]/;

# A-D is a range: the first character must be A, B, C or D; succeeds.
"Cat" =~ /^[A-D]/;


# [^] : the opposite of [], matches any one character NOT listed

# ^[^ABC] requires the first character not to be A, B or C; succeeds.
"Hat" =~ /^[^ABC]/;

# Requires the last character to be a non-letter; succeeds because the
# string ends with '.'.
"Hello World." =~ /[^A-Za-z]$/;


# | : alternation, matches any one of the alternatives

# Matches "cat" or "dog"; succeeds.
"Mary has a cat." =~ /cat|dog/;


# {} : specifies how many times the preceding item must occur

# l{1,3} means one to three 'l'; succeeds.
"Helllo World" =~ /Hel{1,3}o/;

# l{2,} means two or more 'l'; succeeds.
"Helllo World" =~ /Hel{2,}o/;

# l{2} means exactly two 'l'; fails because "Helllo" has three 'l'
# between "He" and "o".
"Helllo World" =~ /Hel{2}o/;


# \b : matches at a word boundary
# \B : matches anywhere that is not a word boundary

# Matches "Hello" as a whole word; succeeds.
"Hello World" =~ /\bHello\b/;

# Fails: "Hello" is followed directly by 'w', so there is no word
# boundary after it.
"Helloworld" =~ /\bHello\b/;

# Succeeds: \B requires that no word boundary follows "Hello".
"Helloworld" =~ /\bHello\B/;

# Matches "word" as a whole word; succeeds because the surrounding
# double quotes are non-word characters.
'This is a "word".' =~ /\bword\b/;


# Shorthand character classes and their ASCII-range equivalents.
# NOTE(review): on Unicode text these classes can match more than the
# ASCII sets shown unless the /a modifier is used -- confirm in perlre.
# \w : word          [a-zA-Z0-9_]
# \W : non-word      [^a-zA-Z0-9_]
# \s : space         [ \t\n\r\f\v]
# \S : non-space     [^ \t\n\r\f\v]
# \d : digit         [0-9]
# \D : non-digit     [^0-9]

"Hello World" =~ /^\w+\W\w+$/;  # succeeds: word, one non-word character, word

# Reference : perlre(1)

常規表達式（二）

# Regular expressions, part 2
# (1) s/// substitution
# With no explicit target, s/// operates on $_ and a bare print outputs $_.
$_ = "Hello World.\n";
s/World/Bill/;  # replaces "World" with "Bill"
print;  # Hello Bill.

$_ = "Hello World.\n";
$word = "World";
s/$word/Bill/;  # variables are interpolated into the pattern
print;  # Hello Bill.


# (2) Capturing with ()
#
# Parentheses in a pattern save the text they matched: $1, $2, ... after the
# match, and \1, \2, ... inside the same pattern (backreferences).
# Every result below is printed with a trailing newline so that the three
# outputs appear on separate lines instead of running together.
$_ = "Every Dog Has It's Day.";

# Capture the first and second words at the start of the string into $1 and
# $2. The capture variables are only meaningful after a successful match, so
# the match is tested before they are used.
if (/^(\w+)\W+(\w+)/) {
    print "The first 2 words are: $1 and $2\n";
}

$_ = "Every Dog Has It's Day.";
s/(\w+)/<$1>/g;     # wrap every run of word characters in <>
print "$_\n";       # <Every> <Dog> <Has> <It>'<s> <Day>.

$_ = "barbarian";
s/(\w+)\1/$1/;      # \1 must repeat what (\w+) captured; keep only one copy
print "$_\n";       # barian


# (3) Modifiers

# g: Match globally, i.e., find all occurrences.
$_ = "Hello World.\n";
s/l/<L>/;   # replaces only the first 'l'
print;      # He<L>lo World.

$_ = "Hello World.\n";
s/l/<L>/g;  # replaces every 'l'
print;      # He<L><L>o Wor<L>d.

# i: Do case-insensitive pattern matching.
# Case differences are ignored, so this succeeds.
"Hello World" =~ /hello world/i;
# o: Compile pattern only once.
# NOTE(review): current Perl documentation describes /o as largely obsolete
# (qr// is the usual way to precompile a pattern) -- confirm in perlop/perlre.
$word = "something";
# $something and $another are placeholders that are not assigned anywhere in
# the visible code, so as written the loop body is never entered.
while($something){

    # something ...

    s/$word/$another/o; # with 'o' the pattern is compiled only once, intended as a speed-up

    # something ...
}

$str = "abcdefg\n";
# Pitfall: /o freezes the pattern at its first interpolation ($_ is 'c'), so
# every iteration wraps 'c' again instead of wrapping c, d, e and f in turn.
$str =~ s/($_)/<$1>/o for('c' .. 'f');
print $str;     # ab<<<<c>>>>defg -- probably not what was intended


# m: Treat string as multiple lines.
# That is, change "^" and "$" from matching the
# start or end of the string to matching the start
# or end of any line anywhere within the string.

$_ = "abc\ndef\nghi\n";

s/^(.)/\u$1/mg; # uppercase the first character of every line (\u uppercases the next character)

print;  # as intended: "Abc\nDef\nGhi\n"


$_ = "abc\ndef\nghi\n";

s/^(.)/\u$1/g;  # without m, ^ matches only at the start of the string

print;  # result becomes "Abc\ndef\nghi\n"


$_ = "abc\ndef\nghi\n";

s/(.)$/\u$1/mg; # likewise, with m the $ matches at the end of every line

print;  # result is "abC\ndeF\nghI\n"
# s: Treat string as single line. That is,
# change "." to match any character whatsoever,
# even a newline, which normally it would not match.
#
# The message is printed with a trailing newline, like the "It matches\n"
# prints earlier in this file, so the output is properly terminated.

$_ = "abc\ndef\nghi\n";

# With s, '.' also matches "\n", so .* can span the lines between 'a' and
# 'i': the match succeeds and the message is printed.
print "Matched\n" if /a.*i/s;

# Without s, '.' cannot cross the first "\n", so the match fails and nothing
# is printed.
print "Matched\n" if /a.*i/;