Python数据处理工具示例

Pandas数据分析

数据清洗与预处理


import pandas as pd
import numpy as np
from datetime import datetime

# 示例数据
data = {
    'date': ['2024-03-01', '2024-03-02', '2024-03-03', '2024-03-04', '2024-03-05'],
    'product': ['A', 'B', 'A', 'C', 'B'],
    'price': [100, 150, np.nan, 200, 180],
    'quantity': [5, 3, 4, np.nan, 2],
    'category': ['电子', '服装', '电子', '食品', '服装']
}

# 创建DataFrame
df = pd.DataFrame(data)

class DataCleaner:
    """数据清洗工具类"""
    
    def __init__(self, df):
        self.df = df.copy()
    
    def handle_missing_values(self, strategy='mean'):
        """处理缺失值
        strategy: 'mean' - 均值填充
                 'median' - 中位数填充
                 'mode' - 众数填充
                 'ffill' - 前向填充
                 'bfill' - 后向填充
        """
        numeric_cols = self.df.select_dtypes(include=[np.number]).columns
        
        for col in numeric_cols:
            if strategy == 'mean':
                self.df[col].fillna(self.df[col].mean(), inplace=True)
            elif strategy == 'median':
                self.df[col].fillna(self.df[col].median(), inplace=True)
            elif strategy == 'mode':
                self.df[col].fillna(self.df[col].mode()[0], inplace=True)
            elif strategy == 'ffill':
                self.df[col].fillna(method='ffill', inplace=True)
            elif strategy == 'bfill':
                self.df[col].fillna(method='bfill', inplace=True)
        
        return self
    
    def convert_dates(self, date_cols):
        """转换日期格式"""
        for col in date_cols:
            self.df[col] = pd.to_datetime(self.df[col])
        return self
    
    def remove_duplicates(self):
        """删除重复行"""
        self.df.drop_duplicates(inplace=True)
        return self
    
    def normalize_column(self, col):
        """标准化数值列"""
        self.df[f'{col}_normalized'] = (
            self.df[col] - self.df[col].mean()
        ) / self.df[col].std()
        return self
    
    def create_categories(self, col, bins, labels):
        """创建分类"""
        self.df[f'{col}_category'] = pd.cut(
            self.df[col],
            bins=bins,
            labels=labels
        )
        return self
    
    def get_clean_data(self):
        """获取清洗后的数据"""
        return self.df

# 使用示例
cleaner = DataCleaner(df)
clean_df = (cleaner
    .convert_dates(['date'])
    .handle_missing_values('mean')
    .remove_duplicates()
    .normalize_column('price')
    .create_categories('quantity', 
                      bins=[0, 2, 4, float('inf')],
                      labels=['低', '中', '高'])
    .get_clean_data())

print("原始数据：")
print(df)
print("\n清洗后的数据：")
print(clean_df)

# 数据分析示例
def analyze_sales_data(df):
    """销售数据分析"""
    # 基本统计
    stats = {
        '总销售额': (df['price'] * df['quantity']).sum(),
        '平均单价': df['price'].mean(),
        '最畅销产品': df.groupby('product')['quantity'].sum().idxmax(),
        '各类别销售占比': (df.groupby('category')['price'].sum() / 
                     df['price'].sum() * 100).round(2)
    }
    
    # 时间趋势
    daily_sales = df.groupby('date').agg({
        'price': lambda x: (x * df.loc[x.index, 'quantity']).sum(),
        'quantity': 'sum'
    }).rename(columns={'price': 'sales_amount'})
    
    # 产品分析
    product_analysis = df.groupby('product').agg({
        'quantity': 'sum',
        'price': 'mean'
    }).round(2)
    
    return {
        'basic_stats': stats,
        'daily_sales': daily_sales,
        'product_analysis': product_analysis
    }

# 分析结果
results = analyze_sales_data(clean_df)
print("\n分析结果：")
for key, value in results.items():
    print(f"\n{key}:")
    print(value)

NumPy科学计算

矩阵运算与统计分析


import numpy as np

class MatrixCalculator:
    """矩阵计算工具类"""
    
    def __init__(self, data=None):
        self.data = np.array(data) if data is not None else None
    
    def generate_random_matrix(self, rows, cols, distribution='normal'):
        """生成随机矩阵
        distribution: 'normal' - 正态分布
                     'uniform' - 均匀分布
                     'binomial' - 二项分布
        """
        if distribution == 'normal':
            self.data = np.random.normal(0, 1, (rows, cols))
        elif distribution == 'uniform':
            self.data = np.random.uniform(-1, 1, (rows, cols))
        elif distribution == 'binomial':
            self.data = np.random.binomial(1, 0.5, (rows, cols))
        return self
    
    def matrix_operations(self):
        """基本矩阵运算"""
        if self.data is None:
            raise ValueError("需要先初始化矩阵")
        
        results = {
            '转置': self.data.T,
            '行列式': np.linalg.det(self.data) if self.data.shape[0] == self.data.shape[1] else None,
            '逆矩阵': np.linalg.inv(self.data) if self.data.shape[0] == self.data.shape[1] else None,
            '特征值': np.linalg.eigvals(self.data) if self.data.shape[0] == self.data.shape[1] else None,
            'SVD分解': np.linalg.svd(self.data)
        }
        return results
    
    def statistical_analysis(self):
        """统计分析"""
        if self.data is None:
            raise ValueError("需要先初始化矩阵")
        
        stats = {
            '均值': np.mean(self.data),
            '标准差': np.std(self.data),
            '最小值': np.min(self.data),
            '最大值': np.max(self.data),
            '中位数': np.median(self.data),
            '方差': np.var(self.data)
        }
        return stats
    
    def matrix_decomposition(self):
        """矩阵分解"""
        if self.data is None or self.data.shape[0] != self.data.shape[1]:
            raise ValueError("需要先初始化方阵")
        
        # QR分解
        Q, R = np.linalg.qr(self.data)
        
        # LU分解
        P, L, U = scipy.linalg.lu(self.data)
        
        # 特征值分解
        eigenvals, eigenvecs = np.linalg.eig(self.data)
        
        return {
            'QR分解': {'Q': Q, 'R': R},
            'LU分解': {'P': P, 'L': L, 'U': U},
            '特征值分解': {'eigenvals': eigenvals, 'eigenvecs': eigenvecs}
        }

# 使用示例
calc = MatrixCalculator()

# 生成3x3随机矩阵
matrix = calc.generate_random_matrix(3, 3, 'normal').data
print("随机矩阵：")
print(matrix)

# 基本运算
operations = calc.matrix_operations()
print("\n矩阵运算结果：")
for op, result in operations.items():
    if result is not None:
        print(f"\n{op}:")
        print(result)

# 统计分析
stats = calc.statistical_analysis()
print("\n统计分析结果：")
for stat, value in stats.items():
    print(f"{stat}: {value}")

# 图像处理示例
def image_processing_example():
    """NumPy图像处理示例"""
    # 创建示例图像 (100x100的灰度图)
    image = np.random.randint(0, 256, (100, 100))
    
    # 图像处理操作
    operations = {
        '原始图像': image,
        '高斯模糊': gaussian_filter(image, sigma=2),
        '边缘检测': sobel(image),
        '二值化': image > np.mean(image),
        '旋转90度': np.rot90(image),
        '水平翻转': np.fliplr(image),
        '垂直翻转': np.flipud(image)
    }
    
    # 图像统计信息
    stats = {
        '平均亮度': np.mean(image),
        '亮度标准差': np.std(image),
        '最大亮度': np.max(image),
        '最小亮度': np.min(image),
        '直方图': np.histogram(image, bins=50)[0]
    }
    
    return operations, stats

# 数值计算示例
def numerical_computing_example():
    """数值计算示例"""
    # 插值
    x = np.linspace(0, 10, 10)
    y = np.sin(x)
    x_new = np.linspace(0, 10, 50)
    y_new = np.interp(x_new, x, y)
    
    # 积分
    def f(x):
        return np.sin(x)
    
    integral = np.trapz(f(x), x)
    
    # 求根
    def g(x):
        return x**2 - 4
    
    root = fsolve(g, x0=1)[0]
    
    # 最小二乘拟合
    x_fit = np.linspace(0, 10, 20)
    y_fit = 3 * x_fit**2 + 2 * x_fit + 1 + np.random.normal(0, 10, 20)
    coeffs = np.polyfit(x_fit, y_fit, 2)
    
    return {
        '插值': {'x': x_new, 'y': y_new},
        '积分': integral,
        '求根': root,
        '拟合系数': coeffs
    }

# 运行示例
image_ops, image_stats = image_processing_example()
numerical_results = numerical_computing_example()

print("\n图像处理结果：")
for op, img in image_ops.items():
    print(f"\n{op} 形状:", img.shape)

print("\n图像统计信息：")
for stat, value in image_stats.items():
    if stat != '直方图':
        print(f"{stat}: {value}")

print("\n数值计算结果：")
for calc, result in numerical_results.items():
    print(f"\n{calc}:")
    print(result)

数据可视化

Matplotlib绘图示例


import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.animation import FuncAnimation
import numpy as np
import pandas as pd

class DataVisualizer:
    """数据可视化工具类"""
    
    def __init__(self):
        # 设置中文字体
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.rcParams['axes.unicode_minus'] = False
        # 设置Seaborn样式
        sns.set_style("whitegrid")
    
    def plot_line_chart(self, x, y, title="折线图", xlabel="X轴", ylabel="Y轴"):
        """绘制折线图"""
        plt.figure(figsize=(10, 6))
        plt.plot(x, y, marker='o')
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.grid(True)
        plt.show()
    
    def plot_scatter(self, x, y, colors=None, sizes=None, title="散点图"):
        """绘制散点图"""
        plt.figure(figsize=(10, 6))
        plt.scatter(x, y, c=colors, s=sizes)
        plt.title(title)
        plt.colorbar()
        plt.show()
    
    def plot_bar_chart(self, categories, values, title="柱状图"):
        """绘制柱状图"""
        plt.figure(figsize=(10, 6))
        plt.bar(categories, values)
        plt.title(title)
        plt.xticks(rotation=45)
        plt.show()
    
    def plot_pie_chart(self, sizes, labels, title="饼图"):
        """绘制饼图"""
        plt.figure(figsize=(10, 6))
        plt.pie(sizes, labels=labels, autopct='%1.1f%%')
        plt.title(title)
        plt.axis('equal')
        plt.show()
    
    def plot_heatmap(self, data, title="热力图"):
        """绘制热力图"""
        plt.figure(figsize=(10, 8))
        sns.heatmap(data, annot=True, cmap='YlOrRd')
        plt.title(title)
        plt.show()
    
    def plot_distribution(self, data, title="分布图"):
        """绘制分布图"""
        plt.figure(figsize=(10, 6))
        sns.histplot(data=data, kde=True)
        plt.title(title)
        plt.show()
    
    def plot_box(self, data, title="箱线图"):
        """绘制箱线图"""
        plt.figure(figsize=(10, 6))
        sns.boxplot(data=data)
        plt.title(title)
        plt.show()
    
    def plot_3d_surface(self, X, Y, Z, title="3D曲面图"):
        """绘制3D曲面图"""
        fig = plt.figure(figsize=(10, 8))
        ax = fig.add_subplot(111, projection='3d')
        surf = ax.plot_surface(X, Y, Z, cmap='viridis')
        plt.colorbar(surf)
        ax.set_title(title)
        plt.show()
    
    def create_animation(self, func, frames, interval=50):
        """创建动画"""
        fig, ax = plt.subplots()
        ani = FuncAnimation(fig, func, frames=frames,
                          interval=interval, blit=True)
        plt.show()
        return ani

# 使用示例
visualizer = DataVisualizer()

# 生成示例数据
np.random.seed(42)
x = np.linspace(0, 10, 100)
y = np.sin(x) + np.random.normal(0, 0.1, 100)

# 折线图示例
visualizer.plot_line_chart(x, y, title="正弦波形图",
                          xlabel="时间", ylabel="振幅")

# 散点图示例
colors = np.random.rand(100)
sizes = np.random.rand(100) * 200
visualizer.plot_scatter(x, y, colors=colors, sizes=sizes,
                       title="带颜色和大小的散点图")

# 柱状图示例
categories = ['A', 'B', 'C', 'D', 'E']
values = np.random.randint(10, 50, 5)
visualizer.plot_bar_chart(categories, values, title="销售数据")

# 饼图示例
sizes = [30, 20, 25, 15, 10]
labels = ['产品A', '产品B', '产品C', '产品D', '产品E']
visualizer.plot_pie_chart(sizes, labels, title="市场份额")

# 热力图示例
correlation_matrix = np.random.rand(5, 5)
visualizer.plot_heatmap(correlation_matrix, title="相关性矩阵")

# 分布图示例
data = np.random.normal(0, 1, 1000)
visualizer.plot_distribution(data, title="正态分布")

# 箱线图示例
data = [np.random.normal(0, std, 100) for std in range(1, 4)]
visualizer.plot_box(data, title="不同组的分布")

# 3D图示例
X = np.linspace(-5, 5, 100)
Y = np.linspace(-5, 5, 100)
X, Y = np.meshgrid(X, Y)
Z = np.sin(np.sqrt(X**2 + Y**2))
visualizer.plot_3d_surface(X, Y, Z, title="3D波形图")

# 动画示例
def update(frame):
    plt.cla()
    x = np.linspace(0, 10, 100)
    y = np.sin(x + frame/10)
    line = plt.plot(x, y)
    return line

visualizer.create_animation(update, frames=100, interval=50)

# 高级可视化示例
def advanced_visualization_example():
    """高级可视化示例"""
    # 创建示例数据
    np.random.seed(42)
    dates = pd.date_range(start='2024-01-01', periods=100)
    data = pd.DataFrame({
        'date': dates,
        'value1': np.random.normal(0, 1, 100).cumsum(),
        'value2': np.random.normal(0, 1, 100).cumsum(),
        'category': np.random.choice(['A', 'B', 'C'], 100)
    })
    
    # 子图布局
    fig = plt.figure(figsize=(15, 10))
    gs = fig.add_gridspec(2, 2)
    
    # 时间序列图
    ax1 = fig.add_subplot(gs[0, :])
    data.plot(x='date', y=['value1', 'value2'], ax=ax1)
    ax1.set_title('时间序列趋势')
    
    # 散点图矩阵
    ax2 = fig.add_subplot(gs[1, 0])
    sns.scatterplot(data=data, x='value1', y='value2',
                   hue='category', ax=ax2)
    ax2.set_title('散点图矩阵')
    
    # 小提琴图
    ax3 = fig.add_subplot(gs[1, 1])
    sns.violinplot(data=data, x='category', y='value1', ax=ax3)
    ax3.set_title('分布小提琴图')
    
    plt.tight_layout()
    plt.show()

# 运行高级可视化示例
advanced_visualization_example()

实际应用

股票数据分析


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf
from datetime import datetime, timedelta
import talib

class StockAnalyzer:
    """股票数据分析工具"""
    
    def __init__(self, symbol, start_date=None, end_date=None):
        self.symbol = symbol
        self.start_date = start_date or (datetime.now() - timedelta(days=365)).strftime('%Y-%m-%d')
        self.end_date = end_date or datetime.now().strftime('%Y-%m-%d')
        self.data = None
        self.load_data()
    
    def load_data(self):
        """加载股票数据"""
        self.data = yf.download(self.symbol, self.start_date, self.end_date)
        return self
    
    def calculate_technical_indicators(self):
        """计算技术指标"""
        # 移动平均线
        self.data['MA5'] = self.data['Close'].rolling(window=5).mean()
        self.data['MA20'] = self.data['Close'].rolling(window=20).mean()
        self.data['MA60'] = self.data['Close'].rolling(window=60).mean()
        
        # RSI指标
        self.data['RSI'] = talib.RSI(self.data['Close'].values)
        
        # MACD指标
        macd, signal, hist = talib.MACD(self.data['Close'].values)
        self.data['MACD'] = macd
        self.data['Signal'] = signal
        self.data['Hist'] = hist
        
        # 布林带
        upper, middle, lower = talib.BBANDS(self.data['Close'].values)
        self.data['BB_upper'] = upper
        self.data['BB_middle'] = middle
        self.data['BB_lower'] = lower
        
        return self
    
    def calculate_returns(self):
        """计算收益率"""
        self.data['Daily_Return'] = self.data['Close'].pct_change()
        self.data['Cumulative_Return'] = (1 + self.data['Daily_Return']).cumprod()
        return self
    
    def plot_price_chart(self):
        """绘制价格走势图"""
        plt.figure(figsize=(15, 10))
        
        # 主图：价格和移动平均线
        plt.subplot(2, 1, 1)
        plt.plot(self.data.index, self.data['Close'], label='收盘价')
        plt.plot(self.data.index, self.data['MA5'], label='MA5')
        plt.plot(self.data.index, self.data['MA20'], label='MA20')
        plt.plot(self.data.index, self.data['MA60'], label='MA60')
        plt.fill_between(self.data.index, 
                        self.data['BB_upper'],
                        self.data['BB_lower'],
                        alpha=0.1)
        plt.title(f'{self.symbol}价格走势')
        plt.legend()
        
        # 副图：成交量
        plt.subplot(2, 1, 2)
        plt.bar(self.data.index, self.data['Volume'])
        plt.title('成交量')
        
        plt.tight_layout()
        plt.show()
    
    def plot_technical_indicators(self):
        """绘制技术指标"""
        plt.figure(figsize=(15, 12))
        
        # RSI
        plt.subplot(3, 1, 1)
        plt.plot(self.data.index, self.data['RSI'])
        plt.axhline(y=70, color='r', linestyle='--')
        plt.axhline(y=30, color='g', linestyle='--')
        plt.title('RSI指标')
        
        # MACD
        plt.subplot(3, 1, 2)
        plt.plot(self.data.index, self.data['MACD'], label='MACD')
        plt.plot(self.data.index, self.data['Signal'], label='Signal')
        plt.bar(self.data.index, self.data['Hist'])
        plt.legend()
        plt.title('MACD指标')
        
        # 累计收益率
        plt.subplot(3, 1, 3)
        plt.plot(self.data.index, self.data['Cumulative_Return'])
        plt.title('累计收益率')
        
        plt.tight_layout()
        plt.show()
    
    def calculate_statistics(self):
        """计算统计指标"""
        stats = {
            '年化收益率': self.data['Daily_Return'].mean() * 252,
            '年化波动率': self.data['Daily_Return'].std() * np.sqrt(252),
            '夏普比率': (self.data['Daily_Return'].mean() * 252) / 
                    (self.data['Daily_Return'].std() * np.sqrt(252)),
            '最大回撤': (self.data['Close'] / self.data['Close'].cummax() - 1).min(),
            '盈利天数': len(self.data[self.data['Daily_Return'] > 0]),
            '亏损天数': len(self.data[self.data['Daily_Return'] < 0])
        }
        return stats

# 使用示例
def stock_analysis_example():
    """股票分析示例"""
    # 创建分析器实例
    analyzer = StockAnalyzer('AAPL')  # 分析苹果公司股票
    
    # 计算指标
    analyzer.calculate_technical_indicators()
    analyzer.calculate_returns()
    
    # 绘制图表
    analyzer.plot_price_chart()
    analyzer.plot_technical_indicators()
    
    # 输出统计数据
    stats = analyzer.calculate_statistics()
    print("\n统计指标：")
    for key, value in stats.items():
        print(f"{key}: {value:.4f}")

# 运行示例
stock_analysis_example()

气象数据分析


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

class WeatherAnalyzer:
    """气象数据分析工具"""
    
    def __init__(self, data_file):
        self.data = pd.read_csv(data_file)
        self.processed_data = None
    
    def preprocess_data(self):
        """数据预处理"""
        # 处理缺失值
        self.processed_data = self.data.fillna(method='ffill')
        
        # 转换日期列
        if 'date' in self.processed_data.columns:
            self.processed_data['date'] = pd.to_datetime(self.processed_data['date'])
        
        # 标准化数值列
        numeric_cols = self.processed_data.select_dtypes(include=[np.number]).columns
        scaler = StandardScaler()
        self.processed_data[numeric_cols] = scaler.fit_transform(
            self.processed_data[numeric_cols]
        )
        
        return self
    
    def analyze_temperature_patterns(self):
        """分析温度模式"""
        temp_data = self.processed_data[['date', 'temperature']]
        
        # 按月份统计
        monthly_stats = temp_data.set_index('date').resample('M').agg({
            'temperature': ['mean', 'std', 'min', 'max']
        })
        
        # 绘制月度统计图
        plt.figure(figsize=(15, 6))
        monthly_stats['temperature']['mean'].plot(kind='line', label='平均温度')
        plt.fill_between(
            monthly_stats.index,
            monthly_stats['temperature']['mean'] - monthly_stats['temperature']['std'],
            monthly_stats['temperature']['mean'] + monthly_stats['temperature']['std'],
            alpha=0.2
        )
        plt.title('月度温度趋势')
        plt.legend()
        plt.show()
        
        return monthly_stats
    
    def analyze_correlations(self):
        """分析相关性"""
        numeric_cols = self.processed_data.select_dtypes(include=[np.number]).columns
        corr_matrix = self.processed_data[numeric_cols].corr()
        
        plt.figure(figsize=(10, 8))
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
        plt.title('气象要素相关性')
        plt.show()
        
        return corr_matrix
    
    def perform_pca_analysis(self):
        """主成分分析"""
        numeric_cols = self.processed_data.select_dtypes(include=[np.number]).columns
        pca = PCA()
        pca_result = pca.fit_transform(self.processed_data[numeric_cols])
        
        # 绘制解释方差比
        plt.figure(figsize=(10, 6))
        plt.plot(range(1, len(pca.explained_variance_ratio_) + 1),
                np.cumsum(pca.explained_variance_ratio_))
        plt.xlabel('主成分数量')
        plt.ylabel('累计解释方差比')
        plt.title('PCA分析')
        plt.show()
        
        return pca_result
    
    def cluster_weather_patterns(self, n_clusters=3):
        """聚类分析天气模式"""
        numeric_cols = self.processed_data.select_dtypes(include=[np.number]).columns
        
        # K-means聚类
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        clusters = kmeans.fit_predict(self.processed_data[numeric_cols])
        
        # 添加聚类标签
        self.processed_data['cluster'] = clusters
        
        # 可视化聚类结果
        plt.figure(figsize=(12, 8))
        for i in range(n_clusters):
            cluster_data = self.processed_data[self.processed_data['cluster'] == i]
            plt.scatter(cluster_data['temperature'],
                       cluster_data['humidity'],
                       label=f'类别 {i}')
        plt.xlabel('温度')
        plt.ylabel('湿度')
        plt.title('天气模式聚类')
        plt.legend()
        plt.show()
        
        return clusters
    
    def generate_report(self):
        """生成分析报告"""
        report = {
            '数据概况': self.processed_data.describe(),
            '缺失值统计': self.processed_data.isnull().sum(),
            '异常值检测': {
                col: self.processed_data[col].quantile([0.01, 0.99]).to_dict()
                for col in self.processed_data.select_dtypes(include=[np.number]).columns
            }
        }
        
        # 时间特征
        if 'date' in self.processed_data.columns:
            self.processed_data['year'] = self.processed_data['date'].dt.year
            self.processed_data['month'] = self.processed_data['date'].dt.month
            
            report['年度统计'] = self.processed_data.groupby('year').mean()
            report['月度统计'] = self.processed_data.groupby('month').mean()
        
        return report

# 使用示例
def weather_analysis_example():
    """气象分析示例"""
    # 创建示例数据
    dates = pd.date_range(start='2023-01-01', end='2023-12-31', freq='D')
    np.random.seed(42)
    
    data = pd.DataFrame({
        'date': dates,
        'temperature': np.random.normal(20, 5, len(dates)) + 
              5 * np.sin(np.arange(len(dates)) * 2 * np.pi / 365),
        'humidity': np.random.normal(60, 10, len(dates)),
        'pressure': np.random.normal(1013, 5, len(dates)),
        'wind_speed': np.random.exponential(5, len(dates))
    })
    
    # 保存为CSV文件
    data.to_csv('weather_data.csv', index=False)
    
    # 创建分析器实例
    analyzer = WeatherAnalyzer('weather_data.csv')
    
    # 数据预处理
    analyzer.preprocess_data()
    
    # 分析温度模式
    monthly_stats = analyzer.analyze_temperature_patterns()
    
    # 相关性分析
    corr_matrix = analyzer.analyze_correlations()
    
    # PCA分析
    pca_result = analyzer.perform_pca_analysis()
    
    # 聚类分析
    clusters = analyzer.cluster_weather_patterns()
    
    # 生成报告
    report = analyzer.generate_report()
    
    print("\n分析报告：")
    for section, content in report.items():
        print(f"\n{section}:")
        print(content)

# 运行示例
weather_analysis_example()