当前位置：首页 > news >正文

【AI Study】第四天，Pandas（7）- 实际应用

news 2025/6/19 19:28:41

文章概要

本文详细介绍 Pandas 在实际场景中的应用，包括：

数据分析案例
数据挖掘
数据报告
实际应用示例

数据分析案例

金融数据分析

# 股票数据分析
import pandas as pd
import numpy as np
import yfinance as yf# 获取股票数据
def get_stock_data(ticker, start_date, end_date):
    """Fetch the price history of one ticker through yfinance.

    Args:
        ticker: Symbol passed to ``yf.Ticker``.
        start_date: Forwarded to ``Ticker.history`` as ``start``.
        end_date: Forwarded to ``Ticker.history`` as ``end``.

    Returns:
        Whatever ``Ticker.history`` returns for the requested range.
    """
    return yf.Ticker(ticker).history(start=start_date, end=end_date)


# Compute technical indicators
def calculate_technical_indicators(df):
    """Add moving-average, RSI and Bollinger-band columns to a price frame.

    The frame is modified in place and also returned, so both
    ``calculate_technical_indicators(df)`` and
    ``df = calculate_technical_indicators(df)`` work.

    Args:
        df: DataFrame with a numeric ``'Close'`` column.

    Returns:
        The same DataFrame with ``MA5``, ``MA20``, ``RSI``, ``std``,
        ``Upper`` and ``Lower`` columns added, in that order. Rows that do
        not yet have a full rolling window hold NaN.
    """
    close = df['Close']

    # Simple moving averages over 5 and 20 rows.
    df['MA5'] = close.rolling(window=5).mean()
    df['MA20'] = close.rolling(window=20).mean()

    # Relative Strength Index over a 14-row window: average gain divided by
    # average loss, mapped onto a 0-100 scale.
    delta = close.diff()
    gain = delta.where(delta > 0, 0).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    df['RSI'] = 100 - (100 / (1 + rs))

    # Bollinger bands: 20-row mean +/- two rolling standard deviations.
    # MA20 computed above is reused instead of being recomputed.
    df['std'] = close.rolling(window=20).std()
    band_width = df['std'] * 2
    df['Upper'] = df['MA20'] + band_width
    df['Lower'] = df['MA20'] - band_width
    return df


# Analysis example
# Example run: one ticker over calendar year 2023.
ticker = 'AAPL'
start_date = '2023-01-01'
end_date = '2023-12-31'
# Fetch the price history
df = get_stock_data(ticker, start_date, end_date)
# Compute technical indicators
df = calculate_technical_indicators(df)
# Print headline figures taken from the Close, High, Low and Volume columns
print("股票数据分析结果：")
print(f"平均收盘价：{df['Close'].mean():.2f}")
print(f"最高价：{df['High'].max():.2f}")
print(f"最低价：{df['Low'].min():.2f}")
print(f"交易量：{df['Volume'].sum():,}")

销售数据分析

# 销售数据分析
def analyze_sales_data(df):
    """Summarise sales by category, by month number, and overall.

    Args:
        df: DataFrame with ``category``, ``product``, ``date``, ``sales``,
            ``quantity`` and ``price`` columns. ``date`` may hold anything
            ``pd.to_datetime`` accepts. The frame is not modified.

    Returns:
        dict with these keys:
            ``category_analysis``: per-category sum of sales and quantity
                and mean price, rounded to 2 decimals.
            ``time_analysis``: sum of sales and quantity per month number
                (rows from different years share a bucket), rounded to 2
                decimals.
            ``total_sales``: sum of the ``sales`` column.
            ``avg_order_value``: ``total_sales`` divided by the row count.
            ``top_products``: the five products with the largest summed
                sales.
    """
    # Per-category totals and average price.
    category_analysis = df.groupby('category').agg({
        'sales': 'sum',
        'quantity': 'sum',
        'price': 'mean',
    }).round(2)

    # Parse dates into a local Series instead of writing the converted
    # column back into the caller's frame.
    dates = pd.to_datetime(df['date'])
    time_analysis = df.groupby(dates.dt.month).agg({
        'sales': 'sum',
        'quantity': 'sum',
    }).round(2)

    # Headline metrics.
    total_sales = df['sales'].sum()
    avg_order_value = total_sales / len(df)
    top_products = df.groupby('product')['sales'].sum().nlargest(5)

    return {
        'category_analysis': category_analysis,
        'time_analysis': time_analysis,
        'total_sales': total_sales,
        'avg_order_value': avg_order_value,
        'top_products': top_products,
    }


# Usage example
# Build 100 consecutive days of synthetic rows with randomly drawn product,
# category, quantity and price; sales is derived as quantity * price.
sales_data = pd.DataFrame({'date': pd.date_range('2023-01-01', periods=100),'product': np.random.choice(['A', 'B', 'C', 'D', 'E'], 100),'category': np.random.choice(['Electronics', 'Clothing', 'Food'], 100),'quantity': np.random.randint(1, 10, 100),'price': np.random.uniform(10, 100, 100)
})
sales_data['sales'] = sales_data['quantity'] * sales_data['price']
# Run the analysis and print the returned dict
results = analyze_sales_data(sales_data)
print("销售数据分析结果：")
print(results)

用户行为分析

# 用户行为分析
def analyze_user_behavior(df):
    """Aggregate per-user activity, page paths and return-visit counts.

    Args:
        df: DataFrame with ``user_id``, ``session_id``, ``date``,
            ``duration``, ``page_views`` and ``page`` columns. ``date``
            must support ordering comparisons.

    Returns:
        dict with these keys:
            ``user_activity``: per-user row count (``session_count``),
                summed duration (``total_duration``) and summed page views
                (``total_page_views``).
            ``user_paths``: per-user list of ``page`` values in row order.
            ``retention``: per-user number of rows dated strictly after
                that user's earliest date; users with no such rows are
                absent.
    """
    # Activity totals, renamed to describe what each aggregate holds.
    user_activity = (
        df.groupby('user_id')
        .agg({'session_id': 'count', 'duration': 'sum', 'page_views': 'sum'})
        .rename(columns={
            'session_id': 'session_count',
            'duration': 'total_duration',
            'page_views': 'total_page_views',
        })
    )

    # Ordered list of pages seen by each user.
    user_paths = df.groupby('user_id')['page'].agg(list)

    # transform('min') broadcasts each user's first date back onto that
    # user's rows, so the comparison below is row-aligned. Comparing against
    # a per-user aggregate indexed by user_id raises on mismatched labels.
    first_visit = df.groupby('user_id')['date'].transform('min')
    retention = df[df['date'] > first_visit].groupby('user_id').size()

    return {
        'user_activity': user_activity,
        'user_paths': user_paths,
        'retention': retention,
    }


# Usage example
# Build 1000 synthetic event rows, one per consecutive day, with randomly
# drawn user, session, duration, page-view count and page name.
user_data = pd.DataFrame({'user_id': np.random.randint(1, 100, 1000),'session_id': np.random.randint(1, 50, 1000),'date': pd.date_range('2023-01-01', periods=1000),'duration': np.random.randint(1, 3600, 1000),'page_views': np.random.randint(1, 20, 1000),'page': np.random.choice(['home', 'product', 'cart', 'checkout'], 1000)
})
# Run the analysis and print the returned dict
results = analyze_user_behavior(user_data)
print("用户行为分析结果：")
print(results)

数据挖掘

特征工程

# 特征工程
def feature_engineering(df):
    """Derive time, ratio, bucket and interaction features.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with a datetime ``timestamp`` column and numeric
            ``price``, ``original_price`` and ``quantity`` columns.

    Returns:
        The same DataFrame with these columns added: ``hour``,
        ``day_of_week``, ``month``, ``price_per_unit``, ``discount_rate``,
        ``price_category`` and ``price_quantity``.
    """
    # Calendar features from the timestamp.
    timestamp = df['timestamp']
    df['hour'] = timestamp.dt.hour
    df['day_of_week'] = timestamp.dt.dayofweek
    df['month'] = timestamp.dt.month

    # Ratio features. A zero denominator is masked to NaN first so the
    # result is NaN rather than +/-inf.
    quantity = df['quantity'].where(df['quantity'] != 0)
    original_price = df['original_price'].where(df['original_price'] != 0)
    df['price_per_unit'] = df['price'] / quantity
    df['discount_rate'] = (original_price - df['price']) / original_price

    # Price buckets: (0, 10], (10, 50], (50, 100], (100, inf).
    # Prices at or below 0 fall outside every bin and get NaN.
    df['price_category'] = pd.cut(
        df['price'],
        bins=[0, 10, 50, 100, float('inf')],
        labels=['low', 'medium', 'high', 'premium'],
    )

    # Interaction feature.
    df['price_quantity'] = df['price'] * df['quantity']
    return df


# Usage example
# Build 100 synthetic rows with daily timestamps and randomly drawn price,
# original price and quantity. The two prices are drawn independently.
df = pd.DataFrame({'timestamp': pd.date_range('2023-01-01', periods=100),'price': np.random.uniform(10, 200, 100),'original_price': np.random.uniform(20, 250, 100),'quantity': np.random.randint(1, 10, 100)
})
# Derive the features and show the first rows
df = feature_engineering(df)
print("特征工程结果：")
print(df.head())

数据预处理

# 数据预处理
def preprocess_data(df, n_std=3.0):
    """Impute, drop outliers, standardise and one-hot encode a frame.

    Steps, in order:
        1. Fill missing ``numeric_column`` values with the column mean and
           missing ``categorical_column`` values with the first mode.
        2. Drop rows whose ``numeric_column`` lies more than ``n_std``
           standard deviations from the mean.
        3. Add ``scaled_column``, the ``StandardScaler`` transform of
           ``numeric_column``.
        4. One-hot encode ``categorical_column`` with ``pd.get_dummies``.

    Args:
        df: DataFrame with ``numeric_column`` and ``categorical_column``.
            The caller's frame is not modified.
        n_std: Outlier cut-off in standard deviations.

    Returns:
        A new, preprocessed DataFrame.
    """
    from sklearn.preprocessing import StandardScaler

    # Impute missing values.
    df = df.fillna({
        'numeric_column': df['numeric_column'].mean(),
        'categorical_column': df['categorical_column'].mode()[0],
    })

    # Remove outliers. Previously the outlier helper was defined but never
    # called, so this step did not run. The filter is skipped when the
    # standard deviation is undefined (e.g. a single row), because every
    # comparison against NaN would be False and drop all rows.
    values = df['numeric_column']
    spread = values.std()
    if pd.notna(spread):
        df = df[(values - values.mean()).abs() <= n_std * spread]
    # Work on an independent copy so the column assignment below does not
    # write into a slice of another frame.
    df = df.copy()

    # Standardise the numeric feature.
    scaler = StandardScaler()
    df['scaled_column'] = scaler.fit_transform(df[['numeric_column']])

    # One-hot encode the categorical feature.
    df = pd.get_dummies(df, columns=['categorical_column'])
    return df


# Usage example
# Build 100 synthetic rows: standard-normal numbers plus a random category.
df = pd.DataFrame({'numeric_column': np.random.randn(100),'categorical_column': np.random.choice(['A', 'B', 'C'], 100)
})
# Inject some missing values and one outlier
df.loc[10:15, 'numeric_column'] = np.nan
df.loc[20:25, 'categorical_column'] = np.nan
df.loc[30, 'numeric_column'] = 100  # outlier
# Preprocess the data and show the first rows
df = preprocess_data(df)
print("数据预处理结果：")
print(df.head())

模型数据准备

# 模型数据准备
def prepare_model_data(df, target_column, k=5):
    """Split a frame into train/test sets and select the k best features.

    Args:
        df: DataFrame holding the feature columns and the target column.
        target_column: Name of the column to predict.
        k: Number of features to keep. Clamped to the number of available
            feature columns.

    Returns:
        dict with ``X_train``, ``X_test``, ``y_train``, ``y_test`` (an
        80/20 split with ``random_state=42``) plus ``X_train_selected`` and
        ``X_test_selected``, the outputs of the fitted ``SelectKBest``.
    """
    from sklearn.feature_selection import SelectKBest, f_regression
    from sklearn.model_selection import train_test_split

    # Separate features from the target.
    X = df.drop(target_column, axis=1)
    y = df[target_column]

    # Hold out 20% of the rows for testing.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Never ask for more features than exist: SelectKBest rejects a k larger
    # than the feature count, which the fixed k=5 hit on narrow frames.
    n_selected = min(k, X.shape[1])
    selector = SelectKBest(f_regression, k=n_selected)
    # Fit on the training split only, then apply the same mask to the test
    # split.
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)

    return {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
        'X_train_selected': X_train_selected,
        'X_test_selected': X_test_selected,
    }


# Usage example
# Build 100 synthetic rows: three standard-normal features and a
# standard-normal target.
df = pd.DataFrame({'feature1': np.random.randn(100),'feature2': np.random.randn(100),'feature3': np.random.randn(100),'target': np.random.randn(100)
})
# Prepare the model data and print the split shapes
model_data = prepare_model_data(df, 'target')
print("模型数据准备结果：")
print(f"训练集大小：{model_data['X_train'].shape}")
print(f"测试集大小：{model_data['X_test'].shape}")

数据报告

数据汇总

# 数据汇总
def generate_summary_report(df):
    """Collect descriptive statistics for a frame into one dict.

    Args:
        df: DataFrame to summarise. It is not modified.

    Returns:
        dict with these keys:
            ``numeric_summary``: ``df.describe()``.
            ``categorical_summary``: ``describe()`` of the object-dtype
                columns, or an empty DataFrame when there are none.
            ``missing_values``: per-column count of null entries.
            ``correlation``: correlation matrix of the numeric columns.
    """
    # Basic statistics.
    numeric_summary = df.describe()

    # describe() raises on a frame without columns, so only call it when at
    # least one object-dtype column was selected.
    categorical = df.select_dtypes(include=['object'])
    if categorical.columns.empty:
        categorical_summary = pd.DataFrame()
    else:
        categorical_summary = categorical.describe()

    # Null counts per column.
    missing_values = df.isnull().sum()

    # Pairwise correlation of the numeric columns.
    correlation = df.select_dtypes(include=[np.number]).corr()

    return {
        'numeric_summary': numeric_summary,
        'categorical_summary': categorical_summary,
        'missing_values': missing_values,
        'correlation': correlation,
    }


# Usage example
# Build 100 synthetic rows: a standard-normal number, a random category and
# a daily date.
df = pd.DataFrame({'numeric_col': np.random.randn(100),'categorical_col': np.random.choice(['A', 'B', 'C'], 100),'date_col': pd.date_range('2023-01-01', periods=100)
})
# Generate the summary report and print the returned dict
report = generate_summary_report(df)
print("数据汇总报告：")
print(report)

报表生成

# 报表生成
def generate_report(df, output_file):
    """Write a multi-sheet Excel report for a frame.

    Sheets written:
        ``数据摘要``: ``df.describe()``.
        ``分类统计``: ``describe()`` of the object-dtype columns; skipped
            when the frame has none.
        ``时间序列分析``: mean of ``value`` per month number of ``date``;
            skipped unless both columns exist.

    Args:
        df: DataFrame to report on.
        output_file: Destination path handed to ``pd.ExcelWriter``.
    """
    # The context manager saves and closes the workbook even when one of
    # the writes below raises.
    with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
        # Data summary.
        df.describe().to_excel(writer, sheet_name='数据摘要')

        # Categorical statistics. describe() raises on a frame without
        # columns, so this sheet is skipped when nothing was selected.
        categorical = df.select_dtypes(include=['object'])
        if not categorical.columns.empty:
            categorical.describe().to_excel(writer, sheet_name='分类统计')

        # Time-series analysis needs both the date and the value column.
        if 'date' in df.columns and 'value' in df.columns:
            monthly_mean = df.groupby(df['date'].dt.month)['value'].mean()
            monthly_mean.to_excel(writer, sheet_name='时间序列分析')


# Usage example
# Build 100 synthetic rows: a daily date, a standard-normal value and a
# random category.
df = pd.DataFrame({'date': pd.date_range('2023-01-01', periods=100),'value': np.random.randn(100),'category': np.random.choice(['A', 'B', 'C'], 100)
})
# Write the Excel report
generate_report(df, 'analysis_report.xlsx')

自动化报告

# 自动化报告
def generate_automated_report(df, output_file):
    """Render ``df.describe()`` as a styled table in a PDF.

    Args:
        df: DataFrame to summarise.
        output_file: Destination path handed to ``SimpleDocTemplate``.
    """
    from reportlab.lib import colors
    from reportlab.lib.pagesizes import letter
    from reportlab.lib.styles import getSampleStyleSheet
    from reportlab.platypus import Paragraph, SimpleDocTemplate, Table, TableStyle

    doc = SimpleDocTemplate(output_file, pagesize=letter)
    styles = getSampleStyleSheet()

    # Title.
    # NOTE(review): the title is Chinese text but no CJK font is registered;
    # the sample 'Title' style may not render these glyphs - confirm.
    elements = [Paragraph("数据分析报告", styles['Title'])]

    # Data summary. The style below treats row 0 as a header, so the table
    # needs a real header row (column names) and a leading column with the
    # statistic names; with bare values the header styling fell on the
    # first data row.
    summary = df.describe()

    def _cell(value):
        # Floats are shortened for display; anything else is shown as-is.
        return f'{value:.2f}' if isinstance(value, float) else str(value)

    header_row = [''] + [str(column) for column in summary.columns]
    data_rows = [
        [str(label)] + [_cell(value) for value in row]
        for label, row in zip(summary.index, summary.values.tolist())
    ]
    table = Table([header_row] + data_rows)
    table.setStyle(TableStyle([
        # Header row.
        ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
        ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
        ('FONTSIZE', (0, 0), (-1, 0), 14),
        ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
        # Body rows.
        ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
        ('TEXTCOLOR', (0, 1), (-1, -1), colors.black),
        ('FONTNAME', (0, 1), (-1, -1), 'Helvetica'),
        ('FONTSIZE', (0, 1), (-1, -1), 12),
        ('GRID', (0, 0), (-1, -1), 1, colors.black),
    ]))
    elements.append(table)

    # Build the PDF.
    doc.build(elements)


# Usage example
# Build 100 synthetic rows: a standard-normal value and a random category.
df = pd.DataFrame({'value': np.random.randn(100),'category': np.random.choice(['A', 'B', 'C'], 100)
})
# Generate the automated PDF report
generate_automated_report(df, 'automated_report.pdf')