# 正则表达式 Python for Data Analysis 笔记

import re

# One or more whitespace characters are described by the regex \s+.
# The pattern is written as a raw string so the backslash reaches the regex
# engine untouched; in a plain string literal '\s' is an invalid escape
# sequence, which recent Python versions warn about.
text = "foo bar\t baz \tqux"

regex = re.compile(r'\s+')
print(regex.split(text))  # same as re.split(r'\s+', text)
# ['foo', 'bar', 'baz', 'qux']

print(regex.findall(text))  # the substrings that matched the pattern
# [' ', '\t ', ' \t']

# Sample input: one "name address" pair per line.
text = """
Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
# The character classes only list upper-case letters, so the pattern is
# compiled with re.IGNORECASE to match the lower-case addresses as well.
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
regex = re.compile(pattern, re.IGNORECASE)

# findall returns every match as a list of strings.
print(regex.findall(text))
# ['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

# search returns only the first match; the Match object carries the start
# and end offsets of that match within the searched string.
m = regex.search(text)
print(m)  # <re.Match object; span=(6, 21), match='dave@google.com'>
print(text[slice(*m.span())])  # dave@google.com

# sub replaces every match with the given replacement string.
print(regex.sub("REDACTED", text))
'''
Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED
'''

# The same email pattern, now split into three capture groups:
# user name, domain name and domain suffix.
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, re.IGNORECASE)

# groups() returns the captured pieces of a match as a tuple.
m = regex.match('wesm@bright.com')
print(m.groups())  # ('wesm', 'bright', 'com')

# With capture groups in the pattern, findall yields one tuple per match.
# `text` is the multi-line name/address sample assigned earlier in this file.
print(regex.findall(text))
'''
[('dave', 'google', 'com'), 
('steve', 'gmail', 'com'), 
('rob', 'gmail', 'com'), 
('ryan', 'yahoo', 'com')]
'''
# In the replacement string of sub, \1, \2, ... refer to the capture groups
# of each match.
print(regex.sub(r'Username:\1, Domain:\2, Suffix:\3', text))
'''
Dave Username:dave, Domain:google, Suffix:com
Steve Username:steve, Domain:gmail, Suffix:com
Rob Username:rob, Domain:gmail, Suffix:com
Ryan Username:ryan, Domain:yahoo, Suffix:com
'''

# Giving each group a name with (?P<name>...) lets a match be read back as a
# dictionary keyed by group name. The pattern is spread over several lines,
# which is why it is compiled with re.VERBOSE (whitespace in the pattern is
# then ignored).
pattern = r"""
(?P<username>[A-Z0-9._%+-]+)
@
(?P<Domain>[A-Z0-9.-]+)
\.
(?P<Suffix>[A-Z]{2,4})
"""
regex = re.compile(pattern, re.IGNORECASE | re.VERBOSE)

# groupdict() maps each group name to the text it captured.
m = regex.match('wesm@bright.com')
print(m.groupdict())
'''
{'username': 'wesm', 'Domain': 'bright', 'Suffix': 'com'}
'''

 

# 上一篇:1. 爬虫基础


# 下一篇:用正则表达式判断是不是正确的IP地址的代码