引言

Python数据分析已成为当今数据科学领域的核心技能之一。从初学者到专家,掌握Python数据分析不仅需要理解基础语法,更需要深入理解数据处理、可视化、统计分析和机器学习等进阶技巧。本课程将带你从入门到精通,通过实战技巧和真实项目案例,帮助你构建完整的数据分析能力体系。

第一部分:Python数据分析基础回顾

1.1 Python环境配置与工具链

在开始进阶学习前,确保你的开发环境配置正确。推荐使用Anaconda发行版,它包含了Python、Jupyter Notebook、NumPy、Pandas等常用库。

# 安装Anaconda(推荐)
# 访问 https://www.anaconda.com/products/distribution 下载安装

# 创建新的conda环境
conda create -n data_analysis python=3.9
conda activate data_analysis

# 安装核心数据分析库
conda install numpy pandas matplotlib seaborn scikit-learn jupyter

1.2 核心库快速回顾

NumPy:科学计算基础

import numpy as np

# 创建数组
arr = np.array([1, 2, 3, 4, 5])
print(f"数组: {arr}")
print(f"形状: {arr.shape}")
print(f"数据类型: {arr.dtype}")

# 数组运算
arr2 = np.array([6, 7, 8, 9, 10])
result = arr + arr2  # 向量化运算
print(f"加法结果: {result}")

# 多维数组操作
matrix = np.random.rand(3, 3)  # 3x3随机矩阵
print(f"矩阵:\n{matrix}")
print(f"矩阵转置:\n{matrix.T}")

Pandas:数据处理利器

import pandas as pd

# 创建DataFrame
data = {
    '姓名': ['张三', '李四', '王五'],
    '年龄': [25, 30, 28],
    '城市': ['北京', '上海', '广州']
}
df = pd.DataFrame(data)
print("原始数据:")
print(df)

# 数据筛选
print("\n年龄大于26的记录:")
print(df[df['年龄'] > 26])

# 数据聚合
print("\n按城市统计人数:")
print(df.groupby('城市').size())
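
除了筛选和聚合,多表合并(merge)也是日常分析的高频操作。下面是一个最小示意,其中orders表是为演示虚构的数据:

# 多表合并示例(orders为虚构的示例数据)
orders = pd.DataFrame({
    '姓名': ['张三', '李四', '张三'],
    '订单金额': [100, 200, 150]
})
df_merged = pd.merge(df, orders, on='姓名', how='left')
print("\n合并订单信息后:")
print(df_merged)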

第二部分:数据清洗与预处理进阶技巧

2.1 处理缺失值的高级策略

import pandas as pd
import numpy as np

# 创建包含缺失值的数据集
data = {
    '销售额': [100, 200, np.nan, 400, 500],
    '利润率': [0.1, np.nan, 0.15, 0.2, 0.18],
    '客户数': [10, 15, 20, np.nan, 25]
}
df = pd.DataFrame(data)

print("原始数据:")
print(df)

# 方法1:删除缺失值
df_drop = df.dropna()
print("\n删除缺失值后:")
print(df_drop)

# 方法2:填充缺失值
# 使用均值填充
df_fill_mean = df.fillna(df.mean())
print("\n用均值填充缺失值:")
print(df_fill_mean)

# 方法3:使用插值法
df_interpolate = df.interpolate(method='linear')
print("\n线性插值填充:")
print(df_interpolate)

# 方法4:使用前向/后向填充
df_ffill = df.ffill()  # 前向填充(fillna(method='ffill')在新版pandas中已弃用)
print("\n前向填充:")
print(df_ffill)
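
除上述方法外,也可以借助scikit-learn的插补器,利用样本间的相似度来估计缺失值。下面是基于上面df的一个最小示意(假设已安装scikit-learn):

# 方法5:基于KNN的插补,用相邻样本的对应特征估计缺失值
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=2)
df_knn = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
print("\nKNN插补填充:")
print(df_knn)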

2.2 数据类型转换与优化

# 优化内存使用
def optimize_memory(df):
    """优化DataFrame内存使用"""
    start_mem = df.memory_usage().sum() / 1024**2
    print(f"原始内存使用: {start_mem:.2f} MB")
    
    for col in df.columns:
        col_type = df[col].dtype
        
        # 只处理数值列,跳过object、日期等非数值类型
        if pd.api.types.is_numeric_dtype(col_type):
            c_min = df[col].min()
            c_max = df[col].max()
            
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    
    end_mem = df.memory_usage().sum() / 1024**2
    print(f"优化后内存使用: {end_mem:.2f} MB")
    print(f"内存减少: {(start_mem - end_mem) / start_mem * 100:.2f}%")
    return df

# 示例
df_large = pd.DataFrame({
    'id': range(100000),
    'value': np.random.randn(100000),
    'category': np.random.choice(['A', 'B', 'C'], 100000)
})
df_optimized = optimize_memory(df_large)

2.3 异常值检测与处理

import numpy as np
import pandas as pd
from scipy import stats

# 创建包含异常值的数据
np.random.seed(42)
data = np.random.normal(0, 1, 1000)
# 添加一些异常值
data = np.append(data, [10, -10, 15, -15])
df = pd.DataFrame({'value': data})

# 方法1:Z-score方法
def detect_outliers_zscore(df, column, threshold=3):
    """使用Z-score检测异常值"""
    z_scores = np.abs(stats.zscore(df[column]))
    outliers = df[z_scores > threshold]
    return outliers

# 方法2:IQR方法
def detect_outliers_iqr(df, column):
    """使用IQR方法检测异常值"""
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
    return outliers

# 检测异常值
outliers_z = detect_outliers_zscore(df, 'value')
outliers_iqr = detect_outliers_iqr(df, 'value')

print(f"Z-score方法检测到的异常值数量: {len(outliers_z)}")
print(f"IQR方法检测到的异常值数量: {len(outliers_iqr)}")

# 处理异常值
def handle_outliers(df, column, method='clip'):
    """处理异常值"""
    if method == 'clip':
        # 截断异常值
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df[column] = df[column].clip(lower_bound, upper_bound)
    elif method == 'remove':
        # 删除异常值
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df

# 处理异常值
df_clean = handle_outliers(df.copy(), 'value', method='clip')
print(f"处理前数据范围: [{df['value'].min():.2f}, {df['value'].max():.2f}]")
print(f"处理后数据范围: [{df_clean['value'].min():.2f}, {df_clean['value'].max():.2f}]")

第三部分:数据可视化进阶技巧

3.1 Matplotlib高级绘图

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# 设置中文字体(SimHei常见于Windows;macOS/Linux需改为系统已安装的中文字体,如'PingFang SC'、'WenQuanYi Zen Hei')
plt.rcParams['font.sans-serif'] = ['SimHei']  # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False    # 用来正常显示负号

# 创建示例数据
np.random.seed(42)
x = np.linspace(0, 10, 100)
y1 = np.sin(x)
y2 = np.cos(x)
y3 = np.sin(x) * np.cos(x)

# 创建子图
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# 子图1:折线图
axes[0, 0].plot(x, y1, 'b-', linewidth=2, label='sin(x)')
axes[0, 0].plot(x, y2, 'r--', linewidth=2, label='cos(x)')
axes[0, 0].set_title('正弦和余弦函数')
axes[0, 0].set_xlabel('x')
axes[0, 0].set_ylabel('y')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# 子图2:散点图
scatter_data = pd.DataFrame({
    'x': np.random.randn(100),
    'y': np.random.randn(100),
    'size': np.random.randint(10, 100, 100),
    'color': np.random.rand(100)
})
axes[0, 1].scatter(scatter_data['x'], scatter_data['y'], 
                   s=scatter_data['size'], 
                   c=scatter_data['color'], 
                   alpha=0.6, 
                   cmap='viridis')
axes[0, 1].set_title('散点图(大小和颜色映射)')
axes[0, 1].set_xlabel('X轴')
axes[0, 1].set_ylabel('Y轴')

# 子图3:柱状图
categories = ['A', 'B', 'C', 'D', 'E']
values1 = np.random.randint(10, 50, 5)
values2 = np.random.randint(10, 50, 5)
x_pos = np.arange(len(categories))
width = 0.35
axes[1, 0].bar(x_pos - width/2, values1, width, label='组1', alpha=0.8)
axes[1, 0].bar(x_pos + width/2, values2, width, label='组2', alpha=0.8)
axes[1, 0].set_xticks(x_pos)
axes[1, 0].set_xticklabels(categories)
axes[1, 0].set_title('分组柱状图')
axes[1, 0].legend()

# 子图4:箱线图
box_data = [np.random.normal(0, std, 100) for std in [1, 2, 3]]
axes[1, 1].boxplot(box_data, labels=['标准差=1', '标准差=2', '标准差=3'])
axes[1, 1].set_title('箱线图比较')
axes[1, 1].set_ylabel('数值')

plt.tight_layout()
plt.show()

3.2 Seaborn高级可视化

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# 设置Seaborn样式
sns.set_style("whitegrid")
sns.set_palette("husl")

# 创建示例数据集
np.random.seed(42)
data = pd.DataFrame({
    'category': np.random.choice(['A', 'B', 'C', 'D'], 200),
    'value1': np.random.randn(200),
    'value2': np.random.randn(200) + 1,
    'group': np.random.choice(['X', 'Y'], 200)
})

# 1. 分布图
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# 小提琴图
sns.violinplot(data=data, x='category', y='value1', ax=axes[0, 0])
axes[0, 0].set_title('小提琴图')

# 箱线图与散点图结合
sns.boxplot(data=data, x='category', y='value1', ax=axes[0, 1])
sns.stripplot(data=data, x='category', y='value1', ax=axes[0, 1], 
              color='black', alpha=0.3, jitter=True)
axes[0, 1].set_title('箱线图+散点图')

# 热力图
corr_matrix = data[['value1', 'value2']].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=axes[1, 0])
axes[1, 0].set_title('相关性热力图')

# 第四个子图留空:jointplot是figure-level函数,会创建独立的图,不能直接画进上面的子图网格
axes[1, 1].axis('off')

plt.tight_layout()
plt.show()

# 联合分布图(单独成图)
sns.jointplot(data=data, x='value1', y='value2', kind='scatter',
              height=6, ratio=4, color='b')
plt.suptitle('联合分布图', y=1.02)
plt.show()

3.3 交互式可视化(Plotly)

import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import numpy as np

# 创建示例数据
np.random.seed(42)
df = pd.DataFrame({
    'x': np.random.randn(100),
    'y': np.random.randn(100),
    'size': np.random.randint(10, 100, 100),
    'category': np.random.choice(['A', 'B', 'C'], 100),
    'value': np.random.randn(100)
})

# 1. 散点图(交互式)
fig1 = px.scatter(df, x='x', y='y', color='category', size='size',
                  hover_data=['value'], title='交互式散点图')
fig1.show()

# 2. 3D散点图
fig2 = px.scatter_3d(df, x='x', y='y', z='value', color='category',
                     size='size', title='3D散点图')
fig2.show()

# 3. 热力图
# 创建相关性矩阵
corr_matrix = df[['x', 'y', 'value']].corr()
fig3 = px.imshow(corr_matrix, text_auto=True, aspect='auto',
                 title='相关性热力图')
fig3.show()

# 4. 动态时间序列图
# 创建时间序列数据
dates = pd.date_range('2023-01-01', periods=100, freq='D')
ts_data = pd.DataFrame({
    'date': dates,
    'value': np.cumsum(np.random.randn(100)),
    'category': np.random.choice(['A', 'B'], 100)
})

fig4 = px.line(ts_data, x='date', y='value', color='category',
               title='时间序列动态图')
fig4.show()
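
交互式图表经常需要分享给没有Python环境的同事,可以把图表导出为独立的HTML文件,用浏览器直接打开(文件名仅为示例):

# 将交互式图表保存为HTML文件
fig4.write_html('time_series.html')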

第四部分:统计分析与假设检验

4.1 描述性统计分析

import pandas as pd
import numpy as np
from scipy import stats

# 创建示例数据
np.random.seed(42)
data = pd.DataFrame({
    'group': np.random.choice(['A', 'B', 'C'], 200),
    'value': np.random.randn(200),
    'score': np.random.randint(50, 100, 200)
})

# 基本统计量
print("基本统计量:")
print(data.describe())

# 按组统计
print("\n按组统计:")
group_stats = data.groupby('group').agg({
    'value': ['mean', 'std', 'min', 'max', 'count'],
    'score': ['mean', 'std', 'min', 'max']
})
print(group_stats)

# 偏度和峰度
print("\n偏度和峰度:")
print(f"值的偏度: {stats.skew(data['value']):.4f}")
print(f"值的峰度: {stats.kurtosis(data['value']):.4f}")

4.2 假设检验

from scipy import stats
import numpy as np

# 1. T检验(独立样本)
group_a = np.random.normal(100, 15, 50)
group_b = np.random.normal(105, 15, 50)

# 独立样本T检验
t_stat, p_value = stats.ttest_ind(group_a, group_b)
print(f"独立样本T检验:")
print(f"T统计量: {t_stat:.4f}")
print(f"P值: {p_value:.4f}")
print(f"显著性水平α=0.05下: {'显著' if p_value < 0.05 else '不显著'}")

# 2. 方差分析(ANOVA)
group_c = np.random.normal(110, 15, 50)
f_stat, p_value = stats.f_oneway(group_a, group_b, group_c)
print(f"\n单因素方差分析:")
print(f"F统计量: {f_stat:.4f}")
print(f"P值: {p_value:.4f}")

# 3. 卡方检验
observed = np.array([[10, 20, 30], [20, 30, 40]])
chi2, p, dof, expected = stats.chi2_contingency(observed)
print(f"\n卡方检验:")
print(f"卡方统计量: {chi2:.4f}")
print(f"P值: {p:.4f}")
print(f"期望频数:\n{expected}")

# 4. 相关性检验
x = np.random.randn(100)
y = 2 * x + np.random.randn(100) * 0.5
corr, p_value = stats.pearsonr(x, y)
print(f"\n皮尔逊相关系数:")
print(f"相关系数: {corr:.4f}")
print(f"P值: {p_value:.4f}")

4.3 多元统计分析

import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# 创建示例数据
np.random.seed(42)
data = pd.DataFrame({
    'feature1': np.random.randn(100),
    'feature2': np.random.randn(100) + 1,
    'feature3': np.random.randn(100) * 2,
    'feature4': np.random.randn(100) * 0.5,
    'target': np.random.choice([0, 1], 100)
})

# 主成分分析(PCA)
X = data[['feature1', 'feature2', 'feature3', 'feature4']]
y = data['target']

# 标准化
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 应用PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

print(f"PCA解释方差比例: {pca.explained_variance_ratio_}")
print(f"累计解释方差: {np.cumsum(pca.explained_variance_ratio_)}")

# 可视化PCA结果
plt.figure(figsize=(10, 6))
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', alpha=0.7)
plt.colorbar(scatter, label='目标类别')
plt.xlabel('主成分1')
plt.ylabel('主成分2')
plt.title('PCA降维可视化')
plt.grid(True, alpha=0.3)
plt.show()
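
除了解释方差比例,还可以查看每个主成分在原始特征上的载荷(loadings),帮助理解各主成分的含义。下面是基于上面pca对象的一个示意:

# 主成分载荷:每行对应一个主成分,每列对应一个原始特征
loadings = pd.DataFrame(pca.components_,
                        columns=X.columns,
                        index=['PC1', 'PC2'])
print("主成分载荷:")
print(loadings)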

第五部分:机器学习基础与应用

5.1 特征工程进阶

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import PolynomialFeatures

# 创建示例数据
np.random.seed(42)
data = pd.DataFrame({
    'age': np.random.randint(18, 65, 100),
    'income': np.random.randint(20000, 100000, 100),
    'education': np.random.choice(['高中', '本科', '硕士', '博士'], 100),
    'city': np.random.choice(['北京', '上海', '广州', '深圳'], 100),
    'purchased': np.random.choice([0, 1], 100)
})

print("原始数据:")
print(data.head())

# 1. 类别特征编码
# 标签编码
le = LabelEncoder()
data['education_encoded'] = le.fit_transform(data['education'])
print("\n标签编码后:")
print(data[['education', 'education_encoded']].head())

# 独热编码
encoder = OneHotEncoder(sparse_output=False)
education_encoded = encoder.fit_transform(data[['education']])
education_df = pd.DataFrame(education_encoded, 
                           columns=encoder.get_feature_names_out(['education']))
data = pd.concat([data, education_df], axis=1)
print("\n独热编码后:")
print(data.head())

# 2. 数值特征分箱
data['age_group'] = pd.cut(data['age'], bins=[18, 30, 45, 65],
                          labels=['青年', '中年', '老年'],
                          include_lowest=True)  # include_lowest避免age恰好为18的样本变成NaN
print("\n年龄分箱后:")
print(data[['age', 'age_group']].head())

# 3. 多项式特征
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(data[['age', 'income']])
poly_features = poly.get_feature_names_out(['age', 'income'])
poly_df = pd.DataFrame(X_poly, columns=poly_features)
print("\n多项式特征:")
print(poly_df.head())

# 4. 特征选择
X = data[['age', 'income', 'education_encoded'] + list(education_df.columns)]
y = data['purchased']

# 使用ANOVA F检验(f_classif)选择特征;若要用卡方检验需改为score_func=chi2,且特征须非负
selector = SelectKBest(score_func=f_classif, k=3)
X_selected = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()]
print(f"\n选择的特征: {list(selected_features)}")

5.2 模型训练与评估

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# 准备数据
X = data[['age', 'income', 'education_encoded'] + list(education_df.columns)]
y = data['purchased']

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 训练随机森林模型
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# 预测
y_pred = rf.predict(X_test)

# 评估
accuracy = accuracy_score(y_test, y_pred)
print(f"模型准确率: {accuracy:.4f}")

# 交叉验证
cv_scores = cross_val_score(rf, X, y, cv=5)
print(f"交叉验证得分: {cv_scores}")
print(f"平均交叉验证得分: {cv_scores.mean():.4f}")

# 分类报告
print("\n分类报告:")
print(classification_report(y_test, y_pred))

# 混淆矩阵
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('混淆矩阵')
plt.xlabel('预测值')
plt.ylabel('真实值')
plt.show()

第六部分:真实项目案例解析

6.1 电商销售数据分析项目

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# 模拟电商销售数据
np.random.seed(42)
n_orders = 1000

# 生成订单数据
orders = pd.DataFrame({
    'order_id': range(1, n_orders + 1),
    'customer_id': np.random.randint(1, 500, n_orders),
    'product_id': np.random.randint(1, 100, n_orders),
    'quantity': np.random.randint(1, 10, n_orders),
    'price': np.random.uniform(10, 500, n_orders).round(2),
    'order_date': pd.date_range('2023-01-01', periods=n_orders, freq='H'),
    'category': np.random.choice(['电子产品', '服装', '食品', '家居'], n_orders),
    'region': np.random.choice(['华北', '华东', '华南', '西南'], n_orders)
})

# 计算销售额
orders['revenue'] = orders['quantity'] * orders['price']

print("数据概览:")
print(orders.head())
print(f"\n数据形状: {orders.shape}")
print(f"\n基本统计:")
print(orders.describe())

# 1. 销售趋势分析
# 按月汇总
orders['month'] = orders['order_date'].dt.to_period('M')
monthly_sales = orders.groupby('month')['revenue'].sum().reset_index()
monthly_sales['month'] = monthly_sales['month'].astype(str)

plt.figure(figsize=(12, 6))
plt.plot(monthly_sales['month'], monthly_sales['revenue'], marker='o', linewidth=2)
plt.title('月度销售趋势')
plt.xlabel('月份')
plt.ylabel('销售额')
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# 2. 品类销售分析
category_sales = orders.groupby('category')['revenue'].sum().sort_values(ascending=False)
plt.figure(figsize=(10, 6))
sns.barplot(x=category_sales.index, y=category_sales.values, palette='viridis')
plt.title('各品类销售额')
plt.xlabel('品类')
plt.ylabel('销售额')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# 3. 地区销售分析
region_sales = orders.groupby('region')['revenue'].sum()
plt.figure(figsize=(8, 8))
plt.pie(region_sales.values, labels=region_sales.index, autopct='%1.1f%%', 
        startangle=90, colors=sns.color_palette('pastel'))
plt.title('各地区销售额占比')
plt.show()

# 4. 客户价值分析(RFM分析)
# 计算最近购买时间、购买频率、购买金额
customer_analysis = orders.groupby('customer_id').agg({
    'order_date': 'max',  # 最近购买时间
    'order_id': 'count',  # 购买频率
    'revenue': 'sum'      # 购买金额
}).reset_index()

customer_analysis.columns = ['customer_id', 'last_purchase', 'frequency', 'monetary']
customer_analysis['recency'] = (datetime(2023, 12, 31) - customer_analysis['last_purchase']).dt.days

# RFM评分
customer_analysis['R_score'] = pd.qcut(customer_analysis['recency'], 5, labels=[5, 4, 3, 2, 1])
customer_analysis['F_score'] = pd.qcut(customer_analysis['frequency'].rank(method='first'), 5, labels=[1, 2, 3, 4, 5])
customer_analysis['M_score'] = pd.qcut(customer_analysis['monetary'], 5, labels=[1, 2, 3, 4, 5])

# 合并RFM分数
customer_analysis['RFM_score'] = customer_analysis['R_score'].astype(str) + \
                                customer_analysis['F_score'].astype(str) + \
                                customer_analysis['M_score'].astype(str)

print("\n客户价值分析(RFM):")
print(customer_analysis.head())

# 5. 关联规则分析(模拟)
# 使用Apriori算法(需要安装mlxtend库)
try:
    from mlxtend.frequent_patterns import apriori, association_rules
    
    # 准备关联规则数据
    basket = orders.groupby(['order_id', 'category'])['quantity'].sum().unstack().fillna(0)
    basket = basket > 0  # 转为布尔型(applymap在新版pandas中已弃用,apriori也推荐布尔输入)
    
    # 查找频繁项集
    frequent_itemsets = apriori(basket, min_support=0.01, use_colnames=True)
    
    # 生成关联规则
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
    
    print("\n关联规则分析(前5条):")
    print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head())
    
except ImportError:
    print("\n提示: 需要安装mlxtend库: pip install mlxtend")

6.2 金融风控数据分析项目

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

# 模拟金融风控数据
np.random.seed(42)
n_samples = 10000

# 生成特征
data = pd.DataFrame({
    'age': np.random.randint(18, 70, n_samples),
    'income': np.random.lognormal(10, 1, n_samples),
    'credit_score': np.random.randint(300, 850, n_samples),
    'debt_ratio': np.random.uniform(0, 0.8, n_samples),
    'monthly_expenses': np.random.uniform(1000, 10000, n_samples),
    'employment_years': np.random.randint(0, 30, n_samples),
    'loan_amount': np.random.lognormal(8, 1, n_samples),
    'loan_term': np.random.choice([12, 24, 36, 48, 60], n_samples),
    'is_default': np.random.choice([0, 1], n_samples, p=[0.85, 0.15])
})

# 特征工程
# 1. 收入与支出比
data['income_expense_ratio'] = data['income'] / data['monthly_expenses']

# 2. 负债收入比
data['debt_income_ratio'] = data['debt_ratio'] * data['income']

# 3. 贷款金额与收入比
data['loan_income_ratio'] = data['loan_amount'] / data['income']

# 4. 年龄分组
data['age_group'] = pd.cut(data['age'], bins=[18, 30, 45, 60, 70],
                          labels=['青年', '中年', '老年', '高龄'],
                          include_lowest=True)  # include_lowest避免边界最小值(如age=18)变成NaN

# 5. 信用评分分组
data['credit_group'] = pd.cut(data['credit_score'], bins=[300, 580, 660, 740, 850],
                             labels=['差', '一般', '良好', '优秀'],
                             include_lowest=True)

print("数据概览:")
print(data.head())
print(f"\n违约率: {data['is_default'].mean():.2%}")

# 准备建模数据
X = data.drop(['is_default', 'age_group', 'credit_group'], axis=1)
y = data['is_default']

# 划分数据集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# 训练模型
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_model.fit(X_train, y_train)

# 预测
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]
y_pred = rf_model.predict(X_test)

# 评估
auc = roc_auc_score(y_test, y_pred_proba)
print(f"\n模型AUC: {auc:.4f}")

# 绘制ROC曲线
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label=f'ROC曲线 (AUC = {auc:.4f})')
plt.plot([0, 1], [0, 1], 'k--', label='随机猜测')
plt.xlabel('假正率')
plt.ylabel('真正率')
plt.title('ROC曲线')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# 特征重要性
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importance, palette='viridis')
plt.title('特征重要性')
plt.xlabel('重要性')
plt.tight_layout()
plt.show()

print("\n特征重要性排序:")
print(feature_importance)

6.3 用户行为分析项目

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# 模拟用户行为数据
np.random.seed(42)
n_users = 1000

# 生成用户行为数据
user_data = pd.DataFrame({
    'user_id': range(1, n_users + 1),
    'session_count': np.random.poisson(5, n_users),
    'avg_session_duration': np.random.exponential(300, n_users),
    'page_views': np.random.poisson(20, n_users),
    'clicks': np.random.poisson(10, n_users),
    'purchase_count': np.random.poisson(2, n_users),
    'avg_purchase_value': np.random.lognormal(3, 0.5, n_users),
    'days_since_last_visit': np.random.exponential(30, n_users)
})

# 特征工程
# 1. 活跃度评分
user_data['activity_score'] = (user_data['session_count'] * 0.3 + 
                              user_data['page_views'] * 0.2 + 
                              user_data['clicks'] * 0.2 + 
                              user_data['purchase_count'] * 0.3)

# 2. 价值评分
user_data['value_score'] = (user_data['purchase_count'] * 0.6 + 
                           user_data['avg_purchase_value'] * 0.4)

# 3. 活跃度分组
user_data['activity_level'] = pd.qcut(user_data['activity_score'], 5, 
                                     labels=['极低', '低', '中', '高', '极高'])

# 4. 价值分组
user_data['value_level'] = pd.qcut(user_data['value_score'], 5, 
                                  labels=['低价值', '中低价值', '中价值', '中高价值', '高价值'])

print("用户数据概览:")
print(user_data.head())

# 用户分群(K-means)
# 选择特征
features = ['session_count', 'avg_session_duration', 'page_views', 
            'clicks', 'purchase_count', 'avg_purchase_value', 'days_since_last_visit']

X = user_data[features]

# 标准化
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 确定最佳K值(肘部法则)
inertia = []
K_range = range(1, 11)
for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

plt.figure(figsize=(10, 6))
plt.plot(K_range, inertia, marker='o')
plt.title('肘部法则确定最佳K值')
plt.xlabel('K值')
plt.ylabel('惯性')
plt.grid(True, alpha=0.3)
plt.show()
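
肘部法则的"拐点"有时不够明显,可以再用轮廓系数(silhouette score)交叉验证K值的选择,得分越接近1说明聚类结构越清晰。下面是基于同一份X_scaled的示意:

# 用轮廓系数辅助选择K值(轮廓系数要求K>=2)
from sklearn.metrics import silhouette_score

for k in range(2, 8):
    kmeans_k = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans_k.fit_predict(X_scaled)
    print(f"K={k}: 轮廓系数={silhouette_score(X_scaled, labels):.4f}")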

# 选择K=4进行聚类
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
user_data['cluster'] = kmeans.fit_predict(X_scaled)

# 分析聚类结果
cluster_analysis = user_data.groupby('cluster').agg({
    'session_count': 'mean',
    'avg_session_duration': 'mean',
    'page_views': 'mean',
    'clicks': 'mean',
    'purchase_count': 'mean',
    'avg_purchase_value': 'mean',
    'days_since_last_visit': 'mean',
    'user_id': 'count'
}).reset_index()

print("\n聚类分析结果:")
print(cluster_analysis)

# 可视化聚类结果
plt.figure(figsize=(12, 8))
sns.scatterplot(data=user_data, x='session_count', y='avg_purchase_value', 
                hue='cluster', palette='viridis', size='purchase_count', 
                sizes=(20, 200), alpha=0.7)
plt.title('用户分群可视化')
plt.xlabel('会话次数')
plt.ylabel('平均购买价值')
plt.legend(title='聚类')
plt.grid(True, alpha=0.3)
plt.show()

# 给每个聚类命名
cluster_names = {
    0: '低活跃低价值',
    1: '高活跃中价值',
    2: '中活跃高价值',
    3: '低活跃高价值'
}
user_data['cluster_name'] = user_data['cluster'].map(cluster_names)

print("\n用户分群特征描述:")
for cluster_id, name in cluster_names.items():
    cluster_data = user_data[user_data['cluster'] == cluster_id]
    print(f"\n{name} (用户数: {len(cluster_data)}):")
    print(f"  平均会话次数: {cluster_data['session_count'].mean():.1f}")
    print(f"  平均购买次数: {cluster_data['purchase_count'].mean():.1f}")
    print(f"  平均购买价值: {cluster_data['avg_purchase_value'].mean():.1f}")

第七部分:性能优化与大数据处理

7.1 Pandas性能优化技巧

import pandas as pd
import numpy as np
import time

# 1. 使用向量化操作替代循环
def vectorized_vs_loop():
    """比较向量化操作和循环的性能"""
    n = 1000000
    arr = np.random.randn(n)
    
    # 循环方式
    start = time.time()
    result_loop = np.zeros(n)
    for i in range(n):
        result_loop[i] = arr[i] ** 2 + 2 * arr[i] + 1
    time_loop = time.time() - start
    
    # 向量化方式
    start = time.time()
    result_vectorized = arr ** 2 + 2 * arr + 1
    time_vectorized = time.time() - start
    
    print(f"循环方式耗时: {time_loop:.4f}秒")
    print(f"向量化方式耗时: {time_vectorized:.4f}秒")
    print(f"性能提升: {time_loop / time_vectorized:.2f}倍")
    
    return result_loop, result_vectorized

# 2. 使用apply的替代方案
def apply_alternatives():
    """apply的替代方案"""
    df = pd.DataFrame({
        'a': np.random.randn(100000),
        'b': np.random.randn(100000)
    })
    
    # 使用apply(慢)
    start = time.time()
    df['result_apply'] = df.apply(lambda row: row['a'] + row['b'], axis=1)
    time_apply = time.time() - start
    
    # 使用向量化(快)
    start = time.time()
    df['result_vectorized'] = df['a'] + df['b']
    time_vectorized = time.time() - start
    
    print(f"apply耗时: {time_apply:.4f}秒")
    print(f"向量化耗时: {time_vectorized:.4f}秒")
    print(f"性能提升: {time_apply / time_vectorized:.2f}倍")

# 3. 使用eval和query
def eval_query_example():
    """eval和query的使用"""
    df = pd.DataFrame({
        'a': np.random.randn(100000),
        'b': np.random.randn(100000),
        'c': np.random.randn(100000)
    })
    
    # 普通方式
    start = time.time()
    result1 = df[(df['a'] > 0) & (df['b'] < 0) & (df['c'] > 0.5)]
    time1 = time.time() - start
    
    # 使用query
    start = time.time()
    result2 = df.query('a > 0 and b < 0 and c > 0.5')
    time2 = time.time() - start
    
    print(f"普通方式耗时: {time1:.4f}秒")
    print(f"query方式耗时: {time2:.4f}秒")
    print(f"性能提升: {time1 / time2:.2f}倍")
    
    # 使用eval进行计算
    start = time.time()
    df['new_col'] = df.eval('a * b + c')
    time_eval = time.time() - start
    
    start = time.time()
    df['new_col_manual'] = df['a'] * df['b'] + df['c']
    time_manual = time.time() - start
    
    print(f"\neval计算耗时: {time_eval:.4f}秒")
    print(f"手动计算耗时: {time_manual:.4f}秒")
    print(f"性能提升: {time_manual / time_eval:.2f}倍")

# 4. 使用category类型优化内存
def optimize_with_category():
    """使用category类型优化内存"""
    df = pd.DataFrame({
        'id': range(100000),
        'category': np.random.choice(['A', 'B', 'C', 'D', 'E'], 100000),
        'value': np.random.randn(100000)
    })
    
    print(f"原始内存使用: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
    
    # 转换为category
    df['category'] = df['category'].astype('category')
    
    print(f"转换后内存使用: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
    print(f"内存减少: {(1 - df.memory_usage(deep=True).sum() / 1024**2 / (100000 * 100 / 1024**2)) * 100:.2f}%")

7.2 大数据处理技巧

import pandas as pd
import numpy as np
import dask.dataframe as dd  # 需要先安装: pip install "dask[dataframe]"
import time

# 1. 使用Dask处理大数据
def dask_example():
    """Dask处理大数据示例"""
    # 创建模拟大数据
    np.random.seed(42)
    n_rows = 10000000  # 1000万行
    
    # 生成数据并保存为CSV
    print("生成大数据文件...")
    chunk_size = 1000000
    for i in range(10):
        df_chunk = pd.DataFrame({
            'id': range(i * chunk_size, (i + 1) * chunk_size),
            'value': np.random.randn(chunk_size),
            'category': np.random.choice(['A', 'B', 'C'], chunk_size)
        })
        df_chunk.to_csv(f'data_chunk_{i}.csv', index=False)
    
    # 使用Dask读取
    print("使用Dask读取数据...")
    start = time.time()
    ddf = dd.read_csv('data_chunk_*.csv')
    time_dask = time.time() - start
    
    # 执行计算
    start = time.time()
    result = ddf.groupby('category')['value'].mean().compute()
    time_compute = time.time() - start
    
    print(f"Dask读取耗时: {time_dask:.4f}秒")
    print(f"Dask计算耗时: {time_compute:.4f}秒")
    print(f"结果:\n{result}")
    
    # 清理文件
    import os
    for i in range(10):
        os.remove(f'data_chunk_{i}.csv')

# 2. 使用分块处理
def chunked_processing():
    """分块处理大数据"""
    # 模拟大数据文件
    np.random.seed(42)
    n_rows = 5000000
    
    # 生成数据
    print("生成大数据...")
    df = pd.DataFrame({
        'id': range(n_rows),
        'value': np.random.randn(n_rows),
        'category': np.random.choice(['A', 'B', 'C'], n_rows)
    })
    
    # 保存为CSV
    df.to_csv('large_data.csv', index=False)
    
    # 分块处理
    chunk_size = 100000
    results = []
    
    start = time.time()
    for chunk in pd.read_csv('large_data.csv', chunksize=chunk_size):
        # 对每个分块统计sum和count(直接对各块均值再求均值,会因各块内类别样本数不同而产生偏差)
        chunk_result = chunk.groupby('category')['value'].agg(['sum', 'count'])
        results.append(chunk_result)
    
    # 合并结果:先汇总sum和count,再计算整体均值
    combined = pd.concat(results).groupby(level=0).sum()
    final_result = combined['sum'] / combined['count']
    time_chunked = time.time() - start
    
    print(f"分块处理耗时: {time_chunked:.4f}秒")
    print(f"最终结果:\n{final_result}")
    
    # 清理文件
    import os
    os.remove('large_data.csv')

第八部分:实战项目综合案例

8.1 电商用户流失预测项目

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# 模拟电商用户数据
np.random.seed(42)
n_users = 5000

# 生成用户特征
data = pd.DataFrame({
    'user_id': range(1, n_users + 1),
    'age': np.random.randint(18, 65, n_users),
    'gender': np.random.choice(['M', 'F'], n_users),
    'region': np.random.choice(['华北', '华东', '华南', '西南', '西北'], n_users),
    'membership_duration': np.random.randint(1, 36, n_users),
    'total_purchases': np.random.poisson(10, n_users),
    'avg_purchase_value': np.random.lognormal(3, 0.5, n_users),
    'last_purchase_days': np.random.exponential(30, n_users),
    'visit_frequency': np.random.exponential(0.5, n_users),
    'cart_abandonment_rate': np.random.uniform(0, 0.8, n_users),
    'coupon_usage': np.random.choice([0, 1], n_users, p=[0.7, 0.3]),
    'churn': np.random.choice([0, 1], n_users, p=[0.8, 0.2])  # 20%流失率
})

print("数据概览:")
print(data.head())
print(f"\n流失率: {data['churn'].mean():.2%}")

# 特征工程
# 1. 数值特征分箱
data['age_group'] = pd.cut(data['age'], bins=[18, 25, 35, 45, 55, 65],
                          labels=['18-25', '26-35', '36-45', '46-55', '56-65'],
                          include_lowest=True)  # include_lowest避免边界最小值变成NaN
data['membership_group'] = pd.cut(data['membership_duration'], bins=[0, 6, 12, 24, 36], 
                                 labels=['新用户', '成长用户', '成熟用户', '老用户'])
data['purchase_group'] = pd.cut(data['total_purchases'], bins=[0, 5, 15, 30, 100],
                               labels=['低频', '中频', '高频', '极高频'],
                               include_lowest=True)

# 2. 类别特征编码
# 独热编码
region_encoded = pd.get_dummies(data['region'], prefix='region')
gender_encoded = pd.get_dummies(data['gender'], prefix='gender')
data = pd.concat([data, region_encoded, gender_encoded], axis=1)

# 3. 创建新特征
data['value_per_purchase'] = data['avg_purchase_value'] / (data['total_purchases'] + 1)
data['recency_score'] = 1 / (data['last_purchase_days'] + 1)
data['engagement_score'] = data['visit_frequency'] * (1 - data['cart_abandonment_rate'])

print("\n特征工程后的数据:")
print(data.head())

# 准备建模数据
feature_cols = [col for col in data.columns if col not in ['user_id', 'churn', 'age', 'region', 'gender', 
                                                          'age_group', 'membership_group', 'purchase_group']]
X = data[feature_cols]
y = data['churn']

print(f"\n特征数量: {len(feature_cols)}")
print(f"特征列表: {feature_cols}")

# 划分数据集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# 标准化
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 模型训练与调参
# 1. 随机森林
rf = RandomForestClassifier(random_state=42, class_weight='balanced')
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}

grid_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='roc_auc', n_jobs=-1)
grid_rf.fit(X_train_scaled, y_train)

print(f"\n随机森林最佳参数: {grid_rf.best_params_}")
print(f"随机森林最佳得分: {grid_rf.best_score_:.4f}")

# 2. 梯度提升树
gb = GradientBoostingClassifier(random_state=42)
param_grid_gb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

grid_gb = GridSearchCV(gb, param_grid_gb, cv=5, scoring='roc_auc', n_jobs=-1)
grid_gb.fit(X_train_scaled, y_train)

print(f"\n梯度提升树最佳参数: {grid_gb.best_params_}")
print(f"梯度提升树最佳得分: {grid_gb.best_score_:.4f}")

# 选择最佳模型
best_model = grid_rf if grid_rf.best_score_ > grid_gb.best_score_ else grid_gb
print(f"\n选择的最佳模型: {type(best_model.best_estimator_).__name__}")

# 模型评估
y_pred = best_model.predict(X_test_scaled)
y_pred_proba = best_model.predict_proba(X_test_scaled)[:, 1]

print("\n模型评估指标:")
print(f"准确率: {accuracy_score(y_test, y_pred):.4f}")
print(f"精确率: {precision_score(y_test, y_pred):.4f}")
print(f"召回率: {recall_score(y_test, y_pred):.4f}")
print(f"F1分数: {f1_score(y_test, y_pred):.4f}")
print(f"AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")

# 特征重要性
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': best_model.best_estimator_.feature_importances_
}).sort_values('importance', ascending=False)

plt.figure(figsize=(12, 8))
sns.barplot(x='importance', y='feature', data=feature_importance.head(15), palette='viridis')
plt.title('特征重要性(前15个)')
plt.xlabel('重要性')
plt.tight_layout()
plt.show()

print("\n特征重要性排序(前10个):")
print(feature_importance.head(10))

# 模型解释(SHAP值)
try:
    import shap
    
    # 计算SHAP值
    explainer = shap.TreeExplainer(best_model.best_estimator_)
    shap_values = explainer.shap_values(X_test_scaled)
    
    # 二分类时,不同版本/模型下shap_values可能是list(每个类别一个数组)或三维数组,这里统一取正类
    if isinstance(shap_values, list):
        shap_values = shap_values[1]
    elif getattr(shap_values, 'ndim', 2) == 3:
        shap_values = shap_values[:, :, 1]
    
    # 可视化
    plt.figure(figsize=(10, 6))
    shap.summary_plot(shap_values, X_test_scaled, feature_names=feature_cols, show=False)
    plt.title('SHAP值汇总图')
    plt.tight_layout()
    plt.show()
    
    print("\nSHAP分析完成")
    
except ImportError:
    print("\n提示: 需要安装shap库: pip install shap")

# 模型部署准备
print("\n模型部署准备:")
print(f"1. 保存模型: 使用joblib或pickle")
print(f"2. 创建API: 使用Flask或FastAPI")
print(f"3. 监控模型性能: 定期评估模型表现")
print(f"4. 模型更新: 当性能下降时重新训练")

第九部分:持续学习与资源推荐

9.1 学习路径建议

  1. 基础阶段(1-2个月)

    • Python基础语法
    • NumPy、Pandas基础操作
    • Matplotlib、Seaborn基础可视化
    • 简单的数据清洗和预处理
  2. 进阶阶段(3-4个月)

    • 高级数据处理技巧
    • 统计分析与假设检验
    • 机器学习基础
    • 特征工程
    • 模型评估与调参
  3. 精通阶段(6个月以上)

    • 深度学习基础
    • 大数据处理(Spark、Dask)
    • 模型部署与运维
    • 领域专业知识(金融、医疗、电商等)
    • 项目实战经验

9.2 推荐学习资源

在线课程

  • Coursera: “Applied Data Science with Python” 专项课程
  • edX: “Data Science MicroMasters” 项目
  • DataCamp: Python数据分析课程
  • Kaggle Learn: 免费的数据科学课程

书籍推荐

  • 《利用Python进行数据分析》(Wes McKinney著)
  • 《统计学习方法》(李航著)
  • 《机器学习》(周志华著)
  • 《Python机器学习手册》(Chris Albon著)

实战平台

  • Kaggle: 参与数据科学竞赛
  • GitHub: 查看优秀项目代码
  • 阿里云天池: 国内数据科学竞赛平台,提供真实数据集和竞赛

社区与论坛

  • Stack Overflow: 技术问题解答
  • Reddit: r/datascience, r/learnpython
  • 知乎: 数据科学相关话题
  • CSDN: 国内技术博客平台

9.3 职业发展建议

  1. 明确职业方向

    • 数据分析师
    • 数据科学家
    • 机器学习工程师
    • 商业智能分析师
    • 数据工程师
  2. 构建作品集

    • GitHub项目展示
    • 技术博客写作
    • Kaggle竞赛成绩
    • 开源项目贡献
  3. 持续学习

    • 关注行业动态
    • 参加技术会议
    • 获取专业认证
    • 建立专业人脉

结语

Python数据分析是一个不断发展的领域,从基础的数据处理到高级的机器学习应用,每一步都需要扎实的理论基础和丰富的实践经验。通过本课程的学习,你已经掌握了从入门到精通的完整知识体系,并通过多个真实项目案例进行了实战演练。

记住,数据分析的核心不仅仅是技术,更重要的是业务理解和问题解决能力。技术只是工具,真正的价值在于如何用数据驱动决策,解决实际问题。

持续学习,不断实践,保持好奇心,你将在数据分析的道路上走得更远。祝你在数据分析的旅程中取得成功!