当前位置：首页 > news >正文

Python 中 re 模块详细教程

news 2025/8/11 14:11:09

Python 正则表达式 re 模块语法参考

re模块概述

Python的re模块提供了正则表达式的支持，是处理字符串模式匹配的强大工具。

导入模块

import re

基本函数

1. re.match()

仅在字符串开头匹配模式；开头不匹配时返回 None

import re

# Basic usage: match() only succeeds when the pattern fits at position 0.
result = re.match(r'hello', 'hello world')
print(result.group(0))  # Output: hello

# Groups: every parenthesised sub-pattern can be read back by its index.
result = re.match(r'(\w+) (\w+)', 'hello world')
print(result.groups())  # Output: ('hello', 'world')
for group_index in (1, 2):
    # Output: hello (index 1), then world (index 2)
    print(result.group(group_index))

2. re.search()

在整个字符串中搜索模式，返回第一个匹配；找不到时返回 None

import re

# Basic search: search() scans the string and returns the first match.
result = re.search(r'\d+', 'abc123def')
print(result.group())  # Output: 123

# Search for an e-mail address.
text = "联系我：user@example.com 或 admin@test.org"
# Inside [...] a '|' is a literal pipe, not alternation, so the top-level
# domain class must be written [A-Za-z] (not [A-Z|a-z]).
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
result = re.search(email_pattern, text)
print(result.group())  # Output: user@example.com

3. re.findall()

查找所有匹配项

import re

# Find every run of digits.
text = "价格是123元，数量是456个"
numbers = re.findall(r'\d+', text)
print(numbers)  # Output: ['123', '456']

# Find every e-mail address.
text = "联系我：user@example.com 或 admin@test.org"
# Inside [...] a '|' is a literal pipe, not alternation, so the top-level
# domain class must be written [A-Za-z] (not [A-Z|a-z]).
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
emails = re.findall(email_pattern, text)
print(emails)  # Output: ['user@example.com', 'admin@test.org']

4. re.finditer()

返回迭代器，包含所有匹配对象

import re

text = "价格是123元，数量是456个"
# finditer() yields Match objects lazily, so positions are available too.
for match in re.finditer(r'\d+', text):
    print(f"找到数字: {match.group()} 位置: {match.span()}")
# Output (spans are character offsets, end-exclusive):
# 找到数字: 123 位置: (3, 6)
# 找到数字: 456 位置: (11, 14)

5. re.sub()

替换匹配的字符串

import re

# Basic replacement with a literal string.
text = "hello world"
new_text = re.sub(r'world', 'python', text)
print(new_text)  # Output: hello python


# Replacement computed by a function.
def replace_func(match):
    """Return the integer matched by *match*, doubled, as a string.

    re.sub() calls this once per match and splices the returned string
    into the result in place of the matched text.
    """
    return str(int(match.group()) * 2)


text = "价格是123元，数量是456个"
new_text = re.sub(r'\d+', replace_func, text)
print(new_text)  # Output: 价格是246元，数量是912个

6. re.split()

根据模式分割字符串

import re

text = "abc123def456ghi"

# Split on runs of digits; the digits themselves are discarded.
digit_run = re.compile(r'\d+')
parts = digit_run.split(text)
print(parts)  # Output: ['abc', 'def', 'ghi']

# Wrapping the separator in a capture group keeps it in the result.
captured_digit_run = re.compile(r'(\d+)')
parts = captured_digit_run.split(text)
print(parts)  # Output: ['abc', '123', 'def', '456', 'ghi']

7. re.compile()

编译正则表达式对象

import re

text = "abc123def456ghi"

# Compile the expression once; the resulting object exposes the same
# operations as the module-level functions.
pattern = re.compile(r'\d+')

# Search with the compiled object.
result = pattern.findall(text)
print(result)  # Output: ['123', '456']

# Substitute with the compiled object.
new_text = pattern.sub('NUMBER', text)
print(new_text)  # Output: abcNUMBERdefNUMBERghi

正则表达式语法

字符类

import re

# Character set: matches any single character listed between the brackets.
pattern = r'[abc]'  # matches a, b or c
text = "apple banana cherry"
matches = re.findall(pattern, text)
# One 'a' from "apple", 'b' and three 'a's from "banana", 'c' from "cherry".
print(matches)  # Output: ['a', 'b', 'a', 'a', 'a', 'c']

# Ranges (each assignment below is an independent example).
pattern = r'[a-z]'  # lowercase ASCII letters
pattern = r'[A-Z]'  # uppercase ASCII letters
pattern = r'[0-9]'  # ASCII digits
pattern = r'[a-zA-Z0-9]'  # ASCII letters and digits

# Negated set: a leading ^ inverts the class.
pattern = r'[^abc]'  # any character except a, b and c

预定义字符类

import re

# Common predefined character classes, mapped to a human-readable description.
# The \s / \S descriptions are raw strings so the escapes are printed
# literally instead of being emitted as real control characters, and they
# include the space character, which \s also matches.
patterns = {
    r'\d': '数字 [0-9]',
    r'\D': '非数字 [^0-9]',
    r'\w': '单词字符 [a-zA-Z0-9_]',
    r'\W': '非单词字符 [^a-zA-Z0-9_]',
    r'\s': r'空白字符 [ \t\n\r\f\v]',
    r'\S': r'非空白字符 [^ \t\n\r\f\v]',
    r'.': '任意字符（除换行符外）',
}

text = "Hello 123 World!"
for pattern, description in patterns.items():
    matches = re.findall(pattern, text)
    print(f"{description}: {matches}")

量词

import re

# Basic quantifiers, mapped to a human-readable description.
patterns = {
    r'a*': '匹配 a 0次或多次',
    r'a+': '匹配 a 1次或多次',
    r'a?': '匹配 a 0次或1次',
    r'a{3}': '匹配 a 恰好3次',
    r'a{3,}': '匹配 a 至少3次',
    r'a{3,5}': '匹配 a 3到5次',
}

text = "aa aaa aaaa aaaaa"
for pattern, description in patterns.items():
    # Quantifiers that allow zero repetitions (a*, a?) also report empty
    # matches at positions where no 'a' is present.
    matches = re.findall(pattern, text)
    print(f"{description}: {matches}")

贪婪与非贪婪

import re

text = "<div>content1</div><div>content2</div>"

# Greedy (default): .* grabs as much as possible, so the match runs from
# the first <div> to the LAST </div>.
greedy_pattern = r'<div>.*</div>'
greedy_match = re.search(greedy_pattern, text)
print("贪婪匹配:", greedy_match.group())

# Non-greedy: .*? stops at the first </div> that lets the pattern succeed.
non_greedy_pattern = r'<div>.*?</div>'
non_greedy_matches = re.findall(non_greedy_pattern, text)
print("非贪婪匹配:", non_greedy_matches)

位置锚点

import re

# Line start / line end. With re.MULTILINE, ^ and $ match at every line
# boundary instead of only at the ends of the whole string.
text = "hello\nworld\nhello python"
pattern = r'^hello'  # "hello" at the start of a line
matches = re.findall(pattern, text, re.MULTILINE)
print("行首匹配:", matches)

pattern = r'hello$'  # "hello" at the end of a line
matches = re.findall(pattern, text, re.MULTILINE)
print("行尾匹配:", matches)

# Word boundaries.
text = "word wordly sword"
pattern = r'\bword\b'  # "word" as a standalone word only
matches = re.findall(pattern, text)
print("单词边界匹配:", matches)

分组和引用

import re

text = "John Doe, Jane Smith"

# Capture groups: with several groups, findall() returns one tuple per match.
matches = re.findall(r'(\w+) (\w+)', text)
print("捕获组:", matches)  # Output: [('John', 'Doe'), ('Jane', 'Smith')]

# Named groups: (?P<name>...) makes each group addressable by name.
pattern = r'(?P<first>\w+) (?P<last>\w+)'
match = re.search(pattern, text)
if match is not None:
    print("命名组:", match.groupdict())

# Backreference: \1 must repeat exactly what group 1 captured.
text = "hello hello world world"
pattern = r'(\w+) \1'  # a word immediately repeated
matches = re.findall(pattern, text)
print("反向引用:", matches)

断言

import re

# Positive lookahead: "apple" only when followed by " pie" or " juice".
text = "apple pie, apple juice, apple"
pattern = r'apple(?=\s+(?:pie|juice))'
matches = re.findall(pattern, text)
print("正向先行断言:", matches)  # Output: ['apple', 'apple']

# Negative lookahead: "apple" only when NOT followed by " pie" or " juice".
pattern = r'apple(?!\s+(?:pie|juice))'
matches = re.findall(pattern, text)
print("负向先行断言:", matches)  # Output: ['apple']

# Positive lookbehind: "apple" only when preceded by a digit.
# Python's re requires a fixed-width look-behind, so (?<=\d+) raises
# re.error; checking the single preceding character (?<=\d) is sufficient.
text = "123apple, 456apple, apple"
pattern = r'(?<=\d)apple'
matches = re.findall(pattern, text)
print("正向后行断言:", matches)  # Output: ['apple', 'apple']

常用模式

邮箱验证

import re


def validate_email(email):
    """Return True if *email* looks like ``local@domain.tld``.

    This is a pragmatic format check, not full RFC 5322 validation.
    re.fullmatch() is used instead of re.match() with ``$`` because ``$``
    also matches just before a trailing newline, which would let
    ``"user@example.com\\n"`` through.
    """
    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    return re.fullmatch(pattern, email) is not None


# Demo
emails = [
    "user@example.com",
    "invalid-email",
    "user@domain",
    "user.name@domain.co.uk",
]
for email in emails:
    print(f"{email}: {validate_email(email)}")

手机号验证（中国大陆）

import re


def validate_phone(phone):
    """Return True if *phone* is an 11-digit mainland-China mobile number.

    The number must start with 1, have 3-9 as its second digit, and consist
    of ASCII digits only. ``[0-9]`` is used instead of ``\\d`` because
    ``\\d`` also matches non-ASCII Unicode digits in str patterns, and
    re.fullmatch() is used so a trailing newline is rejected.
    """
    pattern = r'1[3-9][0-9]{9}'
    return re.fullmatch(pattern, phone) is not None


# Demo
phones = [
    "13812345678",
    "12345678901",
    "1381234567",
    "23812345678",
]
for phone in phones:
    print(f"{phone}: {validate_phone(phone)}")

身份证号验证（中国大陆）

import re


def validate_id_card(id_card):
    """Return True if *id_card* has the shape of an 18-character resident ID.

    Layout checked: 6-digit region code (no leading zero), 4-digit year
    starting with 18/19/20, month 01-12, day 01-31, 3-digit sequence number
    and a final check character (digit or X/x).

    Only the format is verified: the check character is not recomputed and
    impossible calendar dates such as 02-31 are not rejected.
    re.fullmatch() is used so a trailing newline is rejected, and ``[0-9]``
    is used so non-ASCII Unicode digits are rejected.
    """
    pattern = (
        r'[1-9][0-9]{5}'                   # region code
        r'(?:18|19|20)[0-9]{2}'            # year
        r'(?:0[1-9]|1[0-2])'               # month
        r'(?:0[1-9]|[12][0-9]|3[01])'      # day
        r'[0-9]{3}[0-9Xx]'                 # sequence number + check character
    )
    return re.fullmatch(pattern, id_card) is not None


# Demo
id_cards = [
    "110101199001011234",
    "123456789012345",
    "11010119900101123X",
]
for id_card in id_cards:
    print(f"{id_card}: {validate_id_card(id_card)}")

URL验证

import re


def validate_url(url):
    """Return True if *url* is an http:// or https:// URL without whitespace.

    This is a loose shape check: the scheme must be http or https and the
    host must not start with ``/``, ``$``, ``.``, ``?`` or ``#``.
    re.fullmatch() is used instead of re.match() with ``$`` so that a
    trailing newline is rejected.
    """
    pattern = r'https?://[^\s/$.?#].[^\s]*'
    return re.fullmatch(pattern, url) is not None


# Demo
urls = [
    "https://www.example.com",
    "http://example.com/path",
    "ftp://example.com",
    "not-a-url",
]
for url in urls:
    print(f"{url}: {validate_url(url)}")

密码强度验证

import re


def validate_password_strength(password):
    """Return True if *password* meets the strength policy.

    Policy: at least 8 characters, drawn only from letters, digits and
    ``@$!%*?&``, and containing at least one lowercase letter, one
    uppercase letter, one digit and one of those special characters.
    Each requirement is a separate lookahead evaluated from the start of
    the string. re.fullmatch() is used so a trailing newline is rejected.
    """
    pattern = (
        r'(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&])'
        r'[A-Za-z\d@$!%*?&]{8,}'
    )
    return re.fullmatch(pattern, password) is not None


# Demo
passwords = [
    "MyPass123!",
    "weak",
    "NoSpecialChar123",
    "nouppercase123!",
]
for password in passwords:
    print(f"{password}: {validate_password_strength(password)}")

实际应用示例

1. 提取HTML标签内容

import re


def extract_html_content(html):
    """Return the text found between each opening tag and the next closing tag.

    Only flat markup is handled: the captured text may not itself contain
    ``<``, so nested elements are not supported. Use a real HTML parser
    (e.g. ``html.parser``) for anything beyond simple snippets.
    """
    pattern = r'<[^>]*>([^<]*)</[^>]*>'
    return re.findall(pattern, html)


# Demo
html = "<div>Hello World</div><p>Test</p><span>Content</span>"
contents = extract_html_content(html)
print("HTML内容:", contents)

2. 格式化电话号码

import re


def format_phone_number(phone):
    """Normalise an 11-digit phone number to ``XXX-XXXX-XXXX``.

    All non-digit characters are stripped first. If exactly 11 digits
    remain they are regrouped as 3-4-4; otherwise *phone* is returned
    unchanged.
    """
    # Drop every non-digit character (spaces, dashes, dots, ...).
    clean_phone = re.sub(r'\D', '', phone)
    if len(clean_phone) != 11:
        return phone
    return re.sub(r'(\d{3})(\d{4})(\d{4})', r'\1-\2-\3', clean_phone)


# Demo
phones = [
    "13812345678",
    "138-1234-5678",
    "138 1234 5678",
    "138.1234.5678",
]
for phone in phones:
    formatted = format_phone_number(phone)
    print(f"{phone} -> {formatted}")

3. 解析日志文件

import re
from datetime import datetime


def parse_log_line(log_line):
    """Parse one log line into a dict, or return None if it does not match.

    Expected format: ``[2023-01-01 12:00:00] INFO: User login successful``.

    Returns:
        A dict with keys ``'timestamp'`` (a naive ``datetime``), ``'level'``
        and ``'message'``, or ``None`` when the line has a different shape.

    Raises:
        ValueError: if the timestamp has the right shape but is not a real
            date/time (raised by ``datetime.strptime``).
    """
    pattern = r'\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\] (\w+): (.+)'
    match = re.match(pattern, log_line)
    if match is None:
        return None
    timestamp, level, message = match.groups()
    return {
        'timestamp': datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S'),
        'level': level,
        'message': message,
    }


# Demo
log_lines = [
    "[2023-01-01 12:00:00] INFO: User login successful",
    "[2023-01-01 12:01:15] ERROR: Database connection failed",
    "Invalid log line",
]
for line in log_lines:
    parsed = parse_log_line(line)
    print(f"原始: {line}")
    print(f"解析: {parsed}")
    print()

4. 数据清洗

import re


def clean_text(text):
    """Return *text* reduced to CJK/ASCII letters and digits, single-spaced.

    Steps:
      1. Remove every character that is not a CJK ideograph
         (U+4E00-U+9FA5), an ASCII letter or digit, or whitespace.
      2. Collapse each run of whitespace to a single space.
      3. Strip leading and trailing whitespace.

    Symbols are removed BEFORE whitespace is collapsed: doing it the other
    way round leaves a double space wherever a symbol stood between two
    spaces.
    """
    text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


# Demo
dirty_text = "  Hello   World!  @#$%^&*()  \n\n  测试文本  "
clean = clean_text(dirty_text)
print(f"原始文本: '{dirty_text}'")
print(f"清洗后: '{clean}'")

5. 提取CSV数据

import re


def parse_csv_line(line):
    """Split one CSV line into a list of stripped field strings.

    Fields may be wrapped in double quotes to contain commas. Empty fields
    (``a,,b`` or a trailing comma) are preserved as ``''`` rather than being
    dropped. An empty line yields ``[]``.

    Limitations: escaped quotes (``""``) inside a quoted field are not
    supported. For real-world CSV prefer the standard ``csv`` module.
    """
    if not line:
        return []
    # Each field is anchored to the start of the line or to the comma that
    # precedes it, which lets the unquoted alternative match zero characters
    # and therefore keeps empty fields.
    pattern = r'(?:^|,)(?:"([^"]*)"|([^,]*))'
    fields = []
    for quoted, unquoted in re.findall(pattern, line):
        field = quoted if quoted else unquoted
        fields.append(field.strip())
    return fields


# Demo
csv_lines = [
    'name,age,city',
    'John,25,"New York, NY"',
    'Jane,30,"Los Angeles, CA"',
    'Bob,35,Chicago',
]
for line in csv_lines:
    fields = parse_csv_line(line)
    print(f"原始: {line}")
    print(f"解析: {fields}")
    print()

性能优化

1. 使用编译后的正则表达式

import re
import time# 未编译版本
def test_uncompiled(text, iterations=10000):start = time.time()for _ in range(iterations):re.findall(r'\d+', text)return time.time() - start# 编译版本
def test_compiled(text, iterations=10000):pattern = re.compile(r'\d+')start = time.time()for _ in range(iterations):pattern.findall(text)return time.time() - start# 测试性能
text = "abc123def456ghi789" * 100
uncompiled_time = test_uncompiled(text)
compiled_time = test_compiled(text)

print(f"未编译时间: {uncompiled_time:.4f}秒")
print(f"编译时间: {compiled_time:.4f}秒")
# Skip the ratio if the clock reported zero elapsed time, which would
# otherwise raise ZeroDivisionError.
if compiled_time > 0:
    print(f"性能提升: {uncompiled_time/compiled_time:.2f}倍")

2. 避免回溯灾难

import re
import time

# Bad: nested quantifiers let the engine split a run of 'a's between the
# inner and outer '+' in exponentially many ways.
bad_pattern = r'(a+)+b'
# Good: accepts exactly the same strings with a single quantifier.
good_pattern = r'a+b'

# Catastrophic backtracking only shows up when the match FAILS, because the
# engine then tries every possible split before giving up. It does not
# raise an exception - it just takes a very long time - so the input is
# kept short here; each extra 'a' roughly doubles the running time.
text = 'a' * 18 + 'c'

start = time.perf_counter()
re.match(bad_pattern, text)
bad_time = time.perf_counter() - start

start = time.perf_counter()
re.match(good_pattern, text)
good_time = time.perf_counter() - start

print(f"嵌套量词耗时: {bad_time:.4f}秒")
print(f"简单量词耗时: {good_time:.4f}秒")

常见问题

1. 转义字符问题

import re

# Wrong: a plain string literal without the r prefix. Python processes the
# backslash itself before re ever sees it; '\d' is an invalid string escape
# (a warning on modern Python), and something like '\b' silently becomes a
# backspace character instead of a word boundary.
# pattern = '\d+'

# Right: use a raw string so the backslash reaches the regex engine intact.
pattern = r'\d+'

# Alternatively, double the backslash in a normal string literal.
pattern = '\\d+'

2. 贪婪匹配问题

import re

text = "<div>content1</div><div>content2</div>"

# Greedy (often not what you want): one match spanning both elements.
greedy = re.findall(r'<div>.*</div>', text)
print("贪婪匹配:", greedy)

# Non-greedy (usually the intended result): one match per element.
non_greedy = re.findall(r'<div>.*?</div>', text)
print("非贪婪匹配:", non_greedy)

3. 多行模式

import re

text = """line1
line2
line3"""

# Without MULTILINE, ^ matches only at the very start of the string.
matches = re.findall(r'^line', text)
print("不使用多行模式:", matches)

# With MULTILINE, ^ also matches right after every newline.
matches = re.findall(r'^line', text, re.MULTILINE)
print("使用多行模式:", matches)

4. 大小写敏感

import re

text = "Hello WORLD"

# Case-sensitive (default): lowercase 'hello' does not match 'Hello'.
matches = re.findall(r'hello', text)
print("区分大小写:", matches)

# Case-insensitive: re.IGNORECASE makes letters match regardless of case.
matches = re.findall(r'hello', text, re.IGNORECASE)
print("忽略大小写:", matches)