python中re模块详细教程
Python 正则表达式 re 模块语法参考
目录
- re模块概述
- 基本函数
- 正则表达式语法
- 常用模式
- 实际应用示例
- 性能优化
- 常见问题
re模块概述
Python 的 re 模块提供了正则表达式的支持,是处理字符串模式匹配的强大工具。
导入模块
import re
基本函数
1. re.match()
从字符串开头匹配模式
import re

# Basic usage: re.match() only matches at the beginning of the string
result = re.match(r'hello', 'hello world')
print(result.group())  # Output: hello

# Using groups
result = re.match(r'(\w+) (\w+)', 'hello world')
print(result.groups())  # Output: ('hello', 'world')
print(result.group(1))  # Output: hello
print(result.group(2))  # Output: world
2. re.search()
在字符串中搜索模式
import re

# Basic search: re.search() returns the first match anywhere in the string
result = re.search(r'\d+', 'abc123def')
print(result.group())  # Output: 123

# Search for an e-mail address.
text = "联系我:user@example.com 或 admin@test.org"
# The TLD class is [A-Za-z]: inside a character class "|" is a literal pipe,
# not alternation, so "[A-Z|a-z]" would wrongly accept "|" in the TLD.
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
result = re.search(email_pattern, text)
print(result.group())  # Output: user@example.com
3. re.findall()
查找所有匹配项
import re

# Find all numbers
text = "价格是123元,数量是456个"
numbers = re.findall(r'\d+', text)
print(numbers)  # Output: ['123', '456']

# Find all e-mail addresses.
text = "联系我:user@example.com 或 admin@test.org"
# The TLD class is [A-Za-z]: inside a character class "|" is a literal pipe,
# not alternation, so "[A-Z|a-z]" would wrongly accept "|" in the TLD.
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
emails = re.findall(email_pattern, text)
print(emails)  # Output: ['user@example.com', 'admin@test.org']
4. re.finditer()
返回迭代器,包含所有匹配对象
import re

text = "价格是123元,数量是456个"
# finditer() yields match objects, so both the text and its position are available
for match in re.finditer(r'\d+', text):
    print(f"找到数字: {match.group()} 位置: {match.span()}")
# Output:
# 找到数字: 123 位置: (3, 6)
# 找到数字: 456 位置: (11, 14)
5. re.sub()
替换匹配的字符串
import re

# Basic replacement
text = "hello world"
new_text = re.sub(r'world', 'python', text)
print(new_text)  # Output: hello python


# Replacement computed by a callback
def replace_func(match):
    """Return the matched number doubled, as a string.

    re.sub() calls this once per match and inserts the returned string.
    """
    return str(int(match.group()) * 2)


text = "价格是123元,数量是456个"
new_text = re.sub(r'\d+', replace_func, text)
print(new_text)  # Output: 价格是246元,数量是912个
6. re.split()
根据模式分割字符串
import re

# Split on runs of digits
text = "abc123def456ghi"
parts = re.split(r'\d+', text)
print(parts)  # Output: ['abc', 'def', 'ghi']

# Keep the separators: the capturing group makes split() include the delimiters
parts = re.split(r'(\d+)', text)
print(parts)  # Output: ['abc', '123', 'def', '456', 'ghi']
7. re.compile()
编译正则表达式对象
import re

# Compile the regular expression once
pattern = re.compile(r'\d+')

# Use the compiled object for searching
text = "abc123def456ghi"
result = pattern.findall(text)
print(result)  # Output: ['123', '456']

# Use the compiled object for substitution
new_text = pattern.sub('NUMBER', text)
print(new_text)  # Output: abcNUMBERdefNUMBERghi
正则表达式语法
字符类
import re

# Character set
pattern = r'[abc]'  # matches a, b or c
text = "apple banana cherry"
matches = re.findall(pattern, text)
print(matches)  # Output: ['a', 'b', 'a', 'a', 'a', 'c']

# Ranges
pattern = r'[a-z]'  # matches lowercase letters
pattern = r'[A-Z]'  # matches uppercase letters
pattern = r'[0-9]'  # matches digits
pattern = r'[a-zA-Z0-9]'  # matches letters and digits

# Negated character set
pattern = r'[^abc]'  # matches any character except a, b, c
预定义字符类
import re

# Common predefined character classes, mapped to a human-readable description.
# The \s / \S descriptions are raw strings so "\t", "\n", ... are printed
# literally instead of being emitted as real tab/newline characters, and they
# list the space character, which \s also matches.
patterns = {
    r'\d': '数字 [0-9]',
    r'\D': '非数字 [^0-9]',
    r'\w': '单词字符 [a-zA-Z0-9_]',
    r'\W': '非单词字符 [^a-zA-Z0-9_]',
    r'\s': r'空白字符 [ \t\n\r\f\v]',
    r'\S': r'非空白字符 [^ \t\n\r\f\v]',
    r'.': '任意字符(除换行符外)',
}

text = "Hello 123 World!"
for pattern, description in patterns.items():
    matches = re.findall(pattern, text)
    print(f"{description}: {matches}")
量词
import re

# Basic quantifiers, mapped to a human-readable description
patterns = {
    r'a*': '匹配 a 0次或多次',
    r'a+': '匹配 a 1次或多次',
    r'a?': '匹配 a 0次或1次',
    r'a{3}': '匹配 a 恰好3次',
    r'a{3,}': '匹配 a 至少3次',
    r'a{3,5}': '匹配 a 3到5次',
}

text = "aa aaa aaaa aaaaa"
# Patterns that can match the empty string (a*, a?) also report empty matches
for pattern, description in patterns.items():
    matches = re.findall(pattern, text)
    print(f"{description}: {matches}")
贪婪与非贪婪
import re

text = "<div>content1</div><div>content2</div>"

# Greedy matching (default): ".*" runs to the LAST "</div>"
greedy_pattern = r'<div>.*</div>'
greedy_match = re.search(greedy_pattern, text)
print("贪婪匹配:", greedy_match.group())

# Non-greedy matching: ".*?" stops at the FIRST "</div>"
non_greedy_pattern = r'<div>.*?</div>'
non_greedy_matches = re.findall(non_greedy_pattern, text)
print("非贪婪匹配:", non_greedy_matches)
位置锚点
import re

# Start / end of line (re.MULTILINE makes ^ and $ work per line)
text = "hello\nworld\nhello python"
pattern = r'^hello'  # "hello" at the start of a line
matches = re.findall(pattern, text, re.MULTILINE)
print("行首匹配:", matches)  # Output: ['hello', 'hello']

pattern = r'hello$'  # "hello" at the end of a line
matches = re.findall(pattern, text, re.MULTILINE)
print("行尾匹配:", matches)  # Output: ['hello']

# Word boundaries
text = "word wordly sword"
pattern = r'\bword\b'  # the standalone word "word"
matches = re.findall(pattern, text)
print("单词边界匹配:", matches)  # Output: ['word']
分组和引用
import re

# Capturing groups: with several groups findall() returns a list of tuples
text = "John Doe, Jane Smith"
pattern = r'(\w+) (\w+)'
matches = re.findall(pattern, text)
print("捕获组:", matches)  # Output: [('John', 'Doe'), ('Jane', 'Smith')]

# Named groups: groupdict() maps each group name to its captured text
pattern = r'(?P<first>\w+) (?P<last>\w+)'
match = re.search(pattern, text)
if match:print("命名组:", match.groupdict())

# Backreference
text = "hello hello world world"
pattern = r'(\w+) \1'  # matches a word immediately repeated
matches = re.findall(pattern, text)
print("反向引用:", matches)
断言
import re

# Positive lookahead: "apple" only when followed by " pie" or " juice"
text = "apple pie, apple juice, apple"
pattern = r'apple(?=\s+(?:pie|juice))'
matches = re.findall(pattern, text)
print("正向先行断言:", matches)  # Output: ['apple', 'apple']

# Negative lookahead: "apple" NOT followed by " pie" or " juice"
pattern = r'apple(?!\s+(?:pie|juice))'
matches = re.findall(pattern, text)
print("负向先行断言:", matches)  # Output: ['apple']

# Positive lookbehind: "apple" directly preceded by a digit.
# Python's re requires a fixed-width look-behind, so "(?<=\d+)" raises
# re.error; a single "\d" is enough to assert "preceded by a digit".
text = "123apple, 456apple, apple"
pattern = r'(?<=\d)apple'
matches = re.findall(pattern, text)
print("正向后行断言:", matches)  # Output: ['apple', 'apple']
常用模式
邮箱验证
import re


def validate_email(email):
    """Return True if *email* has the shape ``local@domain.tld``.

    ``re.fullmatch`` is used so the entire string must match; with
    ``re.match`` and a trailing ``$`` an address followed by a newline
    would also be accepted.
    """
    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    return bool(re.fullmatch(pattern, email))


# Test
emails = [
    "user@example.com",
    "invalid-email",
    "user@domain",
    "user.name@domain.co.uk",
]
for email in emails:
    print(f"{email}: {validate_email(email)}")
手机号验证(中国大陆)
import re


def validate_phone(phone):
    """Return True if *phone* is an 11-digit mainland-China mobile number.

    The number must start with 1, have 3-9 as its second digit, and be
    followed by nine more digits. ``re.fullmatch`` rejects any trailing
    newline that ``re.match`` with ``$`` would let through.
    """
    pattern = r'1[3-9]\d{9}'
    return bool(re.fullmatch(pattern, phone))


# Test
phones = [
    "13812345678",
    "12345678901",
    "1381234567",
    "23812345678",
]
for phone in phones:
    print(f"{phone}: {validate_phone(phone)}")
身份证号验证(中国大陆)
import re


def validate_id_card(id_card):
    """Return True if *id_card* has the format of an 18-character ID number.

    Layout checked by the pattern: 6-digit region code, 4-digit year
    (18xx/19xx/20xx), 2-digit month, 2-digit day, 3-digit sequence and a
    final check character (digit or X/x). Only the format is checked; the
    checksum is not verified. ``re.fullmatch`` rejects a trailing newline
    that ``re.match`` with ``$`` would let through.
    """
    pattern = (
        r'[1-9]\d{5}(18|19|20)\d{2}((0[1-9])|(1[0-2]))'
        r'(([0-2][1-9])|10|20|30|31)\d{3}[0-9Xx]'
    )
    return bool(re.fullmatch(pattern, id_card))


# Test
id_cards = [
    "110101199001011234",
    "123456789012345",
    "11010119900101123X",
]
for id_card in id_cards:
    print(f"{id_card}: {validate_id_card(id_card)}")
URL验证
import re


def validate_url(url):
    """Return True if *url* is an http:// or https:// URL without whitespace.

    ``re.fullmatch`` is used so the whole string must match; ``re.match``
    with a trailing ``$`` would accept a URL followed by a newline.
    """
    pattern = r'https?://[^\s/$.?#].[^\s]*'
    return bool(re.fullmatch(pattern, url))


# Test
urls = [
    "https://www.example.com",
    "http://example.com/path",
    "ftp://example.com",
    "not-a-url",
]
for url in urls:
    print(f"{url}: {validate_url(url)}")
密码强度验证
import re


def validate_password_strength(password):
    """Return True if *password* meets the strength policy.

    Policy: at least 8 characters, drawn only from letters, digits and
    ``@$!%*?&``, containing at least one lowercase letter, one uppercase
    letter, one digit and one of those special characters. Each lookahead
    checks one requirement. ``re.fullmatch`` rejects a trailing newline
    that ``re.match`` with ``$`` would let through.
    """
    pattern = r'(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&])[A-Za-z\d@$!%*?&]{8,}'
    return bool(re.fullmatch(pattern, password))


# Test
passwords = [
    "MyPass123!",
    "weak",
    "NoSpecialChar123",
    "nouppercase123!",
]
for password in passwords:
    print(f"{password}: {validate_password_strength(password)}")
实际应用示例
1. 提取HTML标签内容
import re


def extract_html_content(html):
    """Return the text found between each simple ``<tag>...</tag>`` pair.

    Only flat, non-nested elements are handled: the captured text may not
    contain ``<``. For real HTML use a proper parser instead of a regex.
    """
    pattern = r'<[^>]*>([^<]*)</[^>]*>'
    return re.findall(pattern, html)


# Test
html = "<div>Hello World</div><p>Test</p><span>Content</span>"
contents = extract_html_content(html)
print("HTML内容:", contents)
2. 格式化电话号码
import re


def format_phone_number(phone):
    """Format an 11-digit phone number as ``XXX-XXXX-XXXX``.

    All non-digit characters are stripped first. If the result does not
    have exactly 11 digits, the input is returned unchanged.
    """
    # Remove every non-digit character
    clean_phone = re.sub(r'\D', '', phone)
    # Only 11-digit numbers are reformatted
    if len(clean_phone) == 11:
        return re.sub(r'(\d{3})(\d{4})(\d{4})', r'\1-\2-\3', clean_phone)
    return phone


# Test
phones = [
    "13812345678",
    "138-1234-5678",
    "138 1234 5678",
    "138.1234.5678",
]
for phone in phones:
    formatted = format_phone_number(phone)
    print(f"{phone} -> {formatted}")
3. 解析日志文件
import re
from datetime import datetime


def parse_log_line(log_line):
    """Parse a log line of the form ``[YYYY-MM-DD HH:MM:SS] LEVEL: message``.

    Returns a dict with keys ``timestamp`` (a ``datetime``), ``level`` and
    ``message``, or ``None`` when the line does not have that format. That
    includes a timestamp that fits the digit pattern but is not a real
    date/time (e.g. month 13).
    """
    # Assumed log format: [2023-01-01 12:00:00] INFO: User login successful
    pattern = r'\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\] (\w+): (.+)'
    match = re.match(pattern, log_line)
    if not match:
        return None
    timestamp, level, message = match.groups()
    try:
        parsed_time = datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')
    except ValueError:
        # The digits matched the regex but are not a valid date/time
        return None
    return {
        'timestamp': parsed_time,
        'level': level,
        'message': message,
    }


# Test
log_lines = [
    "[2023-01-01 12:00:00] INFO: User login successful",
    "[2023-01-01 12:01:15] ERROR: Database connection failed",
    "Invalid log line",
]
for line in log_lines:
    parsed = parse_log_line(line)
    print(f"原始: {line}")
    print(f"解析: {parsed}")
    print()
4. 数据清洗
import re


def clean_text(text):
    """Return *text* with special characters removed and whitespace normalized.

    Only CJK characters in U+4E00..U+9FA5, ASCII letters, digits and
    whitespace are kept. Symbols are removed BEFORE whitespace is collapsed:
    deleting a symbol that sits between two spaces would otherwise leave a
    double space in the result.
    """
    # Remove special characters (keep Chinese, English letters, digits, whitespace)
    text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9\s]', '', text)
    # Collapse every run of whitespace into a single space
    text = re.sub(r'\s+', ' ', text)
    # Trim leading and trailing whitespace
    return text.strip()


# Test
dirty_text = " Hello World! @#$%^&*() \n\n 测试文本 "
clean = clean_text(dirty_text)
print(f"原始文本: '{dirty_text}'")
print(f"清洗后: '{clean}'")
5. 提取CSV数据
import re


def parse_csv_line(line):
    """Split one CSV line into a list of stripped field strings.

    A field wrapped in double quotes may contain commas. Empty fields
    (``a,,b`` or a trailing comma) are kept as empty strings rather than
    silently dropped. An empty line yields an empty list. This is a
    simplified parser (no escaped quotes); use the ``csv`` module for real
    CSV data.
    """
    if not line:
        return []
    # Each field starts at the beginning of the line or right after a comma,
    # and is either a quoted string (group 1) or a run of non-commas (group 2,
    # possibly empty).
    pattern = r'(?:^|,)(?:"([^"]*)"|([^,]*))'
    return [
        (quoted if quoted else plain).strip()
        for quoted, plain in re.findall(pattern, line)
    ]


# Test
csv_lines = [
    'name,age,city',
    'John,25,"New York, NY"',
    'Jane,30,"Los Angeles, CA"',
    'Bob,35,Chicago',
]
for line in csv_lines:
    fields = parse_csv_line(line)
    print(f"原始: {line}")
    print(f"解析: {fields}")
    print()
性能优化
1. 使用编译后的正则表达式
import re
import time# 未编译版本
# Uncompiled version
def test_uncompiled(text, iterations=10000):
    """Return the seconds spent calling the module-level re.findall *iterations* times."""
    # perf_counter() is the clock intended for measuring short durations
    start = time.perf_counter()
    for _ in range(iterations):
        re.findall(r'\d+', text)
    return time.perf_counter() - start


# Compiled version
def test_compiled(text, iterations=10000):
    """Return the seconds spent calling a pre-compiled pattern's findall *iterations* times."""
    pattern = re.compile(r'\d+')
    start = time.perf_counter()
    for _ in range(iterations):
        pattern.findall(text)
    return time.perf_counter() - start


# Compare the two timings
text = "abc123def456ghi789" * 100
uncompiled_time = test_uncompiled(text)
compiled_time = test_compiled(text)

print(f"未编译时间: {uncompiled_time:.4f}秒")
print(f"编译时间: {compiled_time:.4f}秒")
print(f"性能提升: {uncompiled_time/compiled_time:.2f}倍")
2. 避免回溯灾难
import re
import time

# Bad: nested quantifiers give the engine exponentially many ways to split a
# run of "a"s between the inner and outer "+"
bad_pattern = r'(a+)+b'
# Good: a single quantifier accepts the same strings without that ambiguity
good_pattern = r'a+b'

# The blow-up only happens when the match FAILS (here: no "b" after the "a"s).
# re does not raise an exception in that case, it just gets slower, roughly
# doubling with every extra "a". The input is kept short so the demo finishes
# quickly.
text = 'a' * 16 + 'c'

start = time.perf_counter()
re.match(bad_pattern, text)  # slow: tries every way of grouping the "a"s
bad_time = time.perf_counter() - start

start = time.perf_counter()
re.match(good_pattern, text)  # fast: fails after a single pass
good_time = time.perf_counter() - start

print(f"嵌套量词耗时: {bad_time:.6f}秒")
print(f"简单量词耗时: {good_time:.6f}秒")
常见问题
1. 转义字符问题
import re

# Wrong: forgetting to escape. In a normal (non-raw) string literal "\d" is an
# invalid escape sequence, which recent Python versions warn about.
# pattern = '\d+'

# Right: use a raw string
pattern = r'\d+'
# Or double the backslash
pattern = '\\d+'
2. 贪婪匹配问题
import re

text = "<div>content1</div><div>content2</div>"

# Greedy matching (may not be the expected result): one match spanning both divs
greedy = re.findall(r'<div>.*</div>', text)
print("贪婪匹配:", greedy)

# Non-greedy matching (usually what is wanted): one match per div
non_greedy = re.findall(r'<div>.*?</div>', text)
print("非贪婪匹配:", non_greedy)
3. 多行模式
import re

text = """line1
line2
line3"""

# Without multiline mode "^" only matches at the very start of the string
matches = re.findall(r'^line', text)
print("不使用多行模式:", matches)  # Output: ['line']

# With re.MULTILINE "^" matches at the start of every line
matches = re.findall(r'^line', text, re.MULTILINE)
print("使用多行模式:", matches)  # Output: ['line', 'line', 'line']
4. 大小写敏感
import re

text = "Hello WORLD"

# Case-sensitive (default)
matches = re.findall(r'hello', text)
print("区分大小写:", matches)  # Output: []

# Case-insensitive
matches = re.findall(r'hello', text, re.IGNORECASE)
print("忽略大小写:", matches)  # Output: ['Hello']
本文档提供了Python re模块的完整语法参考和实用示例,建议在实际使用中根据具体需求进行调整和优化。