当前位置：首页 > news >正文

python基础入门：7.1迭代器与生成器

news 2025/9/11 11:04:40

Python迭代器与生成器深度解析：高效处理海量数据的利器

# 大文件分块读取生成器模板
def chunked_file_reader(file_path, chunk_size=1024*1024):"""分块读取大文件生成器"""with open(file_path, 'r', encoding='utf-8') as f:while True:chunk = f.read(chunk_size)if not chunk:breakyield chunk# 使用示例
for chunk in chunked_file_reader('huge_log.txt'):process_chunk(chunk)

一、迭代器协议深度解析

迭代器协议组成要素

class CustomIterator:"""自定义迭代器实现"""def __init__(self, data):self.data = dataself.index = 0def __iter__(self):return self  # 返回迭代器对象本身def __next__(self):if self.index >= len(self.data):raise StopIterationvalue = self.data[self.index]self.index += 1return value# 使用示例
colors = CustomIterator(['red', 'green', 'blue'])
for color in colors:print(color)

可迭代对象与迭代器区别

特性	可迭代对象	迭代器
实现方法	`__iter__`	`__iter__` + `__next__`
状态保持	无	保持迭代状态
多次迭代	每次创建新迭代器	单次耗尽
内存占用	通常较大	通常较小
典型示例	list, dict, str	file对象, generator

二、生成器核心机制

生成器函数工作原理

def fibonacci_generator(max_count):"""斐波那契数列生成器"""a, b = 0, 1count = 0while count < max_count:yield aa, b = b, a + bcount += 1# 生成器执行状态演示
gen = fibonacci_generator(5)
print(next(gen))  # 0
print(next(gen))  # 1
print(next(gen))  # 1
print(next(gen))  # 2
print(next(gen))  # 3
# next(gen)  # 触发StopIteration

生成器高级特性

# 协程通信
def data_processor():"""带状态的数据处理器"""total = 0count = 0while True:value = yield  # 接收数据if value is None:breaktotal += valuecount += 1return (total, total/count)# 使用示例
proc = data_processor()
next(proc)  # 激活生成器
proc.send(10)
proc.send(20)
proc.send(30)
try:proc.send(None)
except StopIteration as e:print(f"计算结果: {e.value}")  # (60, 20.0)

三、大文件处理实战

多种文件读取方式对比

def read_entire_file(file_path):"""一次性读取整个文件"""with open(file_path) as f:return f.read()  # 内存杀手！def read_line_by_line(file_path):"""逐行读取文件"""with open(file_path) as f:for line in f:yield linedef read_in_chunks(file_path, chunk_size=4096):"""固定块大小读取"""with open(file_path, 'rb') as f:while chunk := f.read(chunk_size):yield chunkdef smart_reader(file_path, max_lines=1000):"""智能分块读取（自动调整块大小）"""buffer = []with open(file_path) as f:for line in f:buffer.append(line)if len(buffer) >= max_lines:yield bufferbuffer = []if buffer:yield buffer

生产级大文件处理器

class LogFileAnalyzer:"""日志文件分析器"""def __init__(self, file_path):self.file_path = file_pathself._line_count = 0self._error_count = 0def __iter__(self):"""实现迭代器协议"""with open(self.file_path, 'r', errors='replace') as f:for line in f:self._line_count += 1try:parsed = self._parse_line(line)except InvalidLogFormat:self._error_count += 1continueyield parseddef _parse_line(self, line):"""解析单行日志"""if 'ERROR' in line:return self._parse_error(line)# 其他解析逻辑...@propertydef stats(self):return {'total_lines': self._line_count,'error_lines': self._error_count}# 使用示例
analyzer = LogFileAnalyzer('server.log')
for entry in analyzer:process_log_entry(entry)
print(f"分析统计: {analyzer.stats}")

四、性能优化与高级技巧

生成器表达式优化

# 传统列表推导式（立即加载所有数据）
squares = [x**2 for x in range(1000000)]  # 占用大量内存# 生成器表达式（惰性计算）
squares_gen = (x**2 for x in range(1000000))  # 内存友好# 管道式处理
result = (x * 2 for x in squares_gen if x % 3 == 0
)

内存优化对比测试

import sysdata_list = [i for i in range(1000000)]
print(f"列表内存占用: {sys.getsizeof(data_list)/1024/1024:.2f} MB")data_gen = (i for i in range(1000000))
print(f"生成器内存占用: {sys.getsizeof(data_gen)} bytes")

异步生成器（Python 3.6+）

import aiofilesasync def async_file_reader(file_path):"""异步文件读取生成器"""async with aiofiles.open(file_path, mode='r') as f:async for line in f:  # 异步迭代yield line.strip()# 使用示例
async def process_large_file():async for line in async_file_reader('bigfile.txt'):await process_line(line)

最佳实践清单：

优先使用生成器处理大型数据集
避免在生成器中修改外部状态
使用itertools模块优化迭代逻辑
对无限生成器设置安全终止条件
合理选择块大小（通常4KB-1MB）
使用yield from简化嵌套生成器
结合上下文管理器管理资源
使用类型注解提高可读性

# 带类型注解的生成器
from typing import Generator, Iteratordef countdown(n: int) -> Generator[int, None, None]:"""倒计时生成器"""while n > 0:yield nn -= 1# 使用yield from
def flatten(nested_list) -> Iterator:"""嵌套列表展开器"""for item in nested_list:if isinstance(item, (list, tuple)):yield from flatten(item)else:yield item