引言:数学与生物学的交汇点
数学建模作为连接抽象数学理论与现实世界问题的桥梁,在生物学领域发挥着越来越重要的作用。从预测传染病的传播模式到解析复杂的基因调控网络,数学模型为生物学家提供了定量分析和预测的强大工具。本文将深入探讨数学建模在生物学中的应用,特别是从疾病传播到基因调控这两个关键领域,并分析其中的跨学科挑战与未来发展方向。
第一部分:数学建模在疾病传播中的应用
1.1 经典传染病模型:SIR模型及其扩展
传染病建模是数学在生物学中最成功的应用之一。最经典的SIR模型将人群分为三类:易感者(Susceptible)、感染者(Infected)和康复者(Recovered)。模型的基本微分方程如下:
# SIR模型的Python实现示例
import numpy as np
from scipy.integrate import odeint
import matplotlib.pyplot as plt
def sir_model(y, t, beta, gamma):
S, I, R = y
dSdt = -beta * S * I
dIdt = beta * S * I - gamma * I
dRdt = gamma * I
return [dSdt, dIdt, dRdt]
# 参数设置
beta = 0.3 # 感染率
gamma = 0.1 # 康复率
N = 1000 # 总人口
I0, R0 = 1, 0 # 初始感染者和康复者
S0 = N - I0 - R0 # 初始易感者
# 时间点
t = np.linspace(0, 160, 160)
# 求解微分方程
solution = odeint(sir_model, [S0, I0, R0], t, args=(beta, gamma))
S, I, R = solution.T
# 绘制结果
plt.figure(figsize=(10, 6))
plt.plot(t, S, 'b', label='易感者')
plt.plot(t, I, 'r', label='感染者')
plt.plot(t, R, 'g', label='康复者')
plt.xlabel('时间(天)')
plt.ylabel('人数')
plt.title('SIR传染病模型')
plt.legend()
plt.grid(True)
plt.show()
# 计算基本再生数R0
R0_basic = beta / gamma
print(f"基本再生数 R0 = {R0_basic:.2f}")
实际应用案例:2020年COVID-19疫情期间,全球科学家使用扩展的SEIR模型(加入潜伏期)来预测疫情发展。例如,中国疾控中心使用包含年龄结构、空间异质性和干预措施的模型,成功预测了武汉疫情的峰值时间,为防控决策提供了重要依据。
1.2 空间传播模型:元胞自动机与网络模型
现实中的疾病传播具有空间特性,元胞自动机(Cellular Automata)和网络模型能够更好地模拟这种复杂性。
元胞自动机示例:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
class DiseaseSpreadCA:
def __init__(self, size=100, infection_prob=0.3, recovery_prob=0.1):
self.size = size
self.grid = np.random.choice([0, 1, 2], size=(size, size), p=[0.9, 0.09, 0.01])
# 0: 易感, 1: 感染, 2: 康复
self.infection_prob = infection_prob
self.recovery_prob = recovery_prob
def update(self):
new_grid = self.grid.copy()
for i in range(self.size):
for j in range(self.size):
if self.grid[i, j] == 1: # 如果是感染者
# 检查邻居
neighbors = []
for di in [-1, 0, 1]:
for dj in [-1, 0, 1]:
if di == 0 and dj == 0:
continue
ni, nj = (i + di) % self.size, (j + dj) % self.size
neighbors.append(self.grid[ni, nj])
# 感染易感邻居
for idx, neighbor in enumerate(neighbors):
if neighbor == 0: # 易感者
if np.random.random() < self.infection_prob:
ni, nj = (i + [-1, 0, 1, -1, 1, -1, 0, 1][idx]) % self.size, \
(j + [-1, -1, -1, 0, 0, 1, 1, 1][idx]) % self.size
new_grid[ni, nj] = 1
# 康复概率
if np.random.random() < self.recovery_prob:
new_grid[i, j] = 2
self.grid = new_grid
return self.grid
# 创建动画
fig, ax = plt.subplots(figsize=(8, 8))
ca = DiseaseSpreadCA(size=50)
im = ax.imshow(ca.grid, cmap='viridis', vmin=0, vmax=2)
def animate(frame):
ca.update()
im.set_array(ca.grid)
return [im]
ani = FuncAnimation(fig, animate, frames=100, interval=200, blit=True)
plt.title('元胞自动机模拟疾病空间传播')
plt.colorbar(ticks=[0, 1, 2], label='状态 (0:易感, 1:感染, 2:康复)')
plt.show()
网络模型应用:在COVID-19研究中,科学家使用复杂网络模型分析了超级传播事件。例如,韩国首尔的梨泰院聚集性疫情中,研究人员通过社交网络分析识别出关键传播节点,为精准防控提供了依据。
1.3 多尺度建模:从个体到群体
现代传染病建模趋向于多尺度整合,结合个体行为数据和群体传播动力学。
案例:COVID-19的多尺度建模
- 个体尺度:使用代理模型(Agent-Based Model)模拟个体的日常活动轨迹
- 社区尺度:基于人口流动数据预测区域间传播
- 国家尺度:结合医疗资源分配优化防控策略
# 简化的多尺度建模框架示例
class MultiScaleEpidemicModel:
def __init__(self, population_size=10000):
self.population = population_size
self.individuals = []
self.communities = []
def create_individuals(self):
# 创建个体,包含年龄、健康状况、行为模式等属性
for i in range(self.population):
individual = {
'id': i,
'age': np.random.randint(1, 90),
'health_status': 'susceptible',
'mobility': np.random.random(), # 移动性指数
'contacts': np.random.poisson(10) # 日均接触人数
}
self.individuals.append(individual)
def simulate_community_spread(self, days=30):
# 社区尺度传播模拟
community_data = []
for day in range(days):
daily_cases = 0
for ind in self.individuals:
if ind['health_status'] == 'infected':
# 基于接触人数和移动性计算传播概率
transmission_prob = 0.01 * ind['mobility'] * (ind['contacts'] / 10)
if np.random.random() < transmission_prob:
daily_cases += 1
# 随机选择一个邻居感染
neighbor_idx = np.random.randint(0, self.population)
if self.individuals[neighbor_idx]['health_status'] == 'susceptible':
self.individuals[neighbor_idx]['health_status'] = 'infected'
# 康复过程
for ind in self.individuals:
if ind['health_status'] == 'infected' and np.random.random() < 0.1:
ind['health_status'] = 'recovered'
community_data.append(daily_cases)
return community_data
# 运行模拟
model = MultiScaleEpidemicModel(population_size=1000)
model.create_individuals()
# 初始感染者
model.individuals[0]['health_status'] = 'infected'
results = model.simulate_community_spread(days=30)
# 可视化
plt.figure(figsize=(10, 6))
plt.plot(results, 'r-', linewidth=2)
plt.xlabel('天数')
plt.ylabel('每日新增病例')
plt.title('多尺度传染病模型模拟结果')
plt.grid(True)
plt.show()
第二部分:数学建模在基因调控中的应用
2.1 基因调控网络的基础模型
基因调控网络描述了基因、蛋白质和其他分子之间的相互作用。最常用的数学模型是常微分方程(ODE)模型。
基因表达调控的ODE模型:
import numpy as np
from scipy.integrate import solve_ivp
import matplotlib.pyplot as plt
def gene_regulation_ode(t, y, params):
"""
模拟基因表达调控的ODE模型
y[0]: mRNA浓度
y[1]: 蛋白质浓度
"""
mRNA, protein = y
alpha_m, beta_m, delta_m, alpha_p, beta_p, delta_p, K, n = params
# mRNA动力学
dm_dt = alpha_m * (1 + (protein/K)**n) / (1 + (protein/K)**n) - delta_m * mRNA
# 蛋白质动力学
dp_dt = alpha_p * mRNA - delta_p * protein
return [dm_dt, dp_dt]
# 参数设置
params = [
0.5, # alpha_m: mRNA合成速率
0.0, # beta_m: 基础转录速率(简化)
0.1, # delta_m: mRNA降解速率
0.3, # alpha_p: 蛋白质翻译速率
0.0, # beta_p: 基础翻译速率
0.05, # delta_p: 蛋白质降解速率
1.0, # K: 半饱和常数
4 # n: 协同系数
]
# 初始条件
y0 = [0.1, 0.1]
# 时间范围
t_span = (0, 100)
t_eval = np.linspace(0, 100, 1000)
# 求解ODE
solution = solve_ivp(gene_regulation_ode, t_span, y0, args=(params,), t_eval=t_eval)
# 绘制结果
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(solution.t, solution.y[0], 'b-', label='mRNA')
plt.plot(solution.t, solution.y[1], 'r-', label='蛋白质')
plt.xlabel('时间')
plt.ylabel('浓度')
plt.title('基因调控ODE模型')
plt.legend()
plt.grid(True)
# 参数敏感性分析
plt.subplot(1, 2, 2)
sensitivity_results = []
for alpha_m in np.linspace(0.1, 1.0, 20):
params[0] = alpha_m
sol = solve_ivp(gene_regulation_ode, t_span, y0, args=(params,), t_eval=t_eval)
sensitivity_results.append(sol.y[1][-1]) # 最终蛋白质浓度
plt.plot(np.linspace(0.1, 1.0, 20), sensitivity_results, 'g-')
plt.xlabel('mRNA合成速率 (alpha_m)')
plt.ylabel('最终蛋白质浓度')
plt.title('参数敏感性分析')
plt.grid(True)
plt.tight_layout()
plt.show()
2.2 布尔网络模型:离散状态下的基因调控
对于大规模基因调控网络,布尔网络模型因其计算效率高而被广泛使用。
布尔网络模型实现:
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
class BooleanGeneNetwork:
def __init__(self, n_genes=10):
self.n_genes = n_genes
self.network = nx.DiGraph()
self.states = np.random.randint(0, 2, n_genes) # 随机初始状态
def create_random_network(self, edge_prob=0.3):
"""创建随机布尔网络"""
for i in range(self.n_genes):
self.network.add_node(i)
for j in range(self.n_genes):
if i != j and np.random.random() < edge_prob:
# 随机分配调控关系(激活或抑制)
regulation = np.random.choice(['activate', 'inhibit'])
self.network.add_edge(i, j, regulation=regulation)
def update_state(self, state):
"""更新基因状态"""
new_state = state.copy()
for gene in range(self.n_genes):
# 获取所有调控该基因的基因
regulators = list(self.network.predecessors(gene))
if not regulators:
# 无调控基因,保持当前状态
continue
# 计算调控效应
activation = 0
inhibition = 0
for reg in regulators:
edge_data = self.network[reg][gene]
if edge_data['regulation'] == 'activate':
activation += state[reg]
else:
inhibition += state[reg]
# 布尔逻辑:如果激活>抑制,则激活
if activation > inhibition:
new_state[gene] = 1
elif activation < inhibition:
new_state[gene] = 0
# 如果相等,保持原状态
return new_state
def simulate(self, steps=20):
"""模拟网络动态"""
trajectory = [self.states.copy()]
for _ in range(steps):
self.states = self.update_state(self.states)
trajectory.append(self.states.copy())
return np.array(trajectory)
# 创建并模拟布尔网络
bn = BooleanGeneNetwork(n_genes=15)
bn.create_random_network(edge_prob=0.4)
trajectory = bn.simulate(steps=30)
# 可视化网络结构
plt.figure(figsize=(15, 5))
plt.subplot(1, 3, 1)
pos = nx.spring_layout(bn.network)
nx.draw(bn.network, pos, with_labels=True, node_color='lightblue',
node_size=500, arrowsize=15, font_size=10)
plt.title('基因调控网络结构')
# 可视化状态轨迹
plt.subplot(1, 3, 2)
plt.imshow(trajectory.T, aspect='auto', cmap='viridis', interpolation='nearest')
plt.xlabel('时间步')
plt.ylabel('基因')
plt.title('基因状态时间轨迹')
plt.colorbar(label='状态 (0/1)')
# 可视化吸引子
plt.subplot(1, 3, 3)
# 计算吸引子(稳定状态)
attractors = []
for i in range(trajectory.shape[0]):
if i > 0 and np.array_equal(trajectory[i], trajectory[i-1]):
attractors.append(trajectory[i])
if attractors:
attractor = attractors[0]
plt.bar(range(len(attractor)), attractor)
plt.xlabel('基因')
plt.ylabel('状态')
plt.title('吸引子(稳定状态)')
else:
plt.text(0.5, 0.5, '无稳定吸引子', ha='center', va='center', fontsize=12)
plt.tight_layout()
plt.show()
# 分析网络特性
print(f"网络节点数: {bn.network.number_of_nodes()}")
print(f"网络边数: {bn.network.number_of_edges()}")
print(f"平均入度: {np.mean([d for n, d in bn.network.in_degree()])}")
print(f"平均出度: {np.mean([d for n, d in bn.network.out_degree()])}")
2.3 随机过程模型:基因表达的随机性
基因表达具有内在随机性,随机过程模型能更好地捕捉这种特性。
随机微分方程(SDE)模型:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
def simulate_gene_expression_sde(n_simulations=100, steps=1000, dt=0.01):
"""
模拟基因表达的随机微分方程
使用Euler-Maruyama方法
"""
# 参数
alpha = 0.5 # 合成速率
beta = 0.1 # 降解速率
sigma = 0.1 # 噪声强度
# 存储结果
trajectories = []
for sim in range(n_simulations):
x = np.zeros(steps)
x[0] = 0.1 # 初始浓度
for t in range(1, steps):
# 确定性部分
deterministic = (alpha - beta * x[t-1]) * dt
# 随机部分(维纳过程增量)
random = sigma * np.sqrt(dt) * np.random.randn()
# 更新
x[t] = x[t-1] + deterministic + random
# 确保非负
if x[t] < 0:
x[t] = 0
trajectories.append(x)
return np.array(trajectories)
# 运行模拟
trajectories = simulate_gene_expression_sde(n_simulations=50, steps=1000, dt=0.01)
# 可视化
plt.figure(figsize=(12, 8))
# 绘制多条轨迹
plt.subplot(2, 2, 1)
for i in range(min(10, len(trajectories))):
plt.plot(trajectories[i], alpha=0.7, linewidth=0.8)
plt.xlabel('时间步')
plt.ylabel('mRNA浓度')
plt.title('基因表达随机轨迹(10条)')
plt.grid(True)
# 绘制平均轨迹和置信区间
plt.subplot(2, 2, 2)
mean_traj = np.mean(trajectories, axis=0)
std_traj = np.std(trajectories, axis=0)
plt.plot(mean_traj, 'b-', linewidth=2, label='均值')
plt.fill_between(range(len(mean_traj)),
mean_traj - 1.96*std_traj,
mean_traj + 1.96*std_traj,
alpha=0.3, label='95%置信区间')
plt.xlabel('时间步')
plt.ylabel('mRNA浓度')
plt.title('平均轨迹与置信区间')
plt.legend()
plt.grid(True)
# 绘制稳态分布
plt.subplot(2, 2, 3)
steady_state = trajectories[:, -1] # 最后时刻的浓度
plt.hist(steady_state, bins=20, density=True, alpha=0.7, edgecolor='black')
plt.xlabel('mRNA浓度')
plt.ylabel('概率密度')
plt.title('稳态分布')
# 理论分布(近似)
x_range = np.linspace(0, max(steady_state), 100)
# 理论稳态分布(近似为Gamma分布)
alpha_param = 0.5 / (0.1**2) # 形状参数
beta_param = 0.1 / 0.1**2 # 尺度参数
from scipy.stats import gamma
pdf = gamma.pdf(x_range, a=alpha_param, scale=1/beta_param)
plt.plot(x_range, pdf, 'r-', linewidth=2, label='理论分布')
plt.legend()
# 噪声强度分析
plt.subplot(2, 2, 4)
sigma_values = np.linspace(0.01, 0.5, 10)
cv_values = []
for sigma in sigma_values:
traj = simulate_gene_expression_sde(n_simulations=20, steps=500, dt=0.01)
cv = np.std(traj[:, -1]) / np.mean(traj[:, -1])
cv_values.append(cv)
plt.plot(sigma_values, cv_values, 'go-', linewidth=2)
plt.xlabel('噪声强度 (σ)')
plt.ylabel('变异系数 (CV)')
plt.title('噪声对表达变异的影响')
plt.grid(True)
plt.tight_layout()
plt.show()
第三部分:跨学科挑战与解决方案
3.1 数据挑战:高维性与噪声
生物学数据通常具有高维、稀疏和噪声大的特点,这对数学建模提出了挑战。
解决方案示例:降维与特征选择
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
import pandas as pd
def preprocess_biological_data(data, labels=None, n_components=50):
"""
生物学数据预处理:降维与特征选择
"""
# 1. 数据标准化
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)
# 2. 特征选择(如果有标签)
if labels is not None:
selector = SelectKBest(score_func=f_classif, k=min(100, data.shape[1]))
data_selected = selector.fit_transform(data_scaled, labels)
selected_features = selector.get_support(indices=True)
else:
data_selected = data_scaled
selected_features = None
# 3. PCA降维
pca = PCA(n_components=min(n_components, data_selected.shape[1]))
data_pca = pca.fit_transform(data_selected)
# 4. 解释方差
explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)
return {
'data_pca': data_pca,
'explained_variance': explained_variance,
'cumulative_variance': cumulative_variance,
'selected_features': selected_features,
'pca_model': pca
}
# 示例:基因表达数据降维
np.random.seed(42)
n_samples = 100
n_features = 1000
n_informative = 50
# 生成模拟基因表达数据
X = np.random.randn(n_samples, n_features)
# 添加一些有信息的特征
for i in range(n_informative):
X[:, i] += 2 * (np.random.randn(n_samples) > 0)
# 生成标签(疾病状态)
y = (X[:, :5].sum(axis=1) > 0).astype(int)
# 预处理
result = preprocess_biological_data(X, y, n_components=20)
# 可视化
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(result['cumulative_variance'], 'bo-')
plt.xlabel('主成分数量')
plt.ylabel('累计解释方差')
plt.title('PCA降维效果')
plt.grid(True)
plt.subplot(1, 2, 2)
plt.scatter(result['data_pca'][:, 0], result['data_pca'][:, 1], c=y, cmap='viridis', alpha=0.7)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('降维后的数据分布')
plt.colorbar(label='类别')
plt.tight_layout()
plt.show()
3.2 模型复杂性与可解释性平衡
复杂的数学模型可能过拟合,而简单的模型可能无法捕捉生物学复杂性。
解决方案:模型选择与验证
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import warnings
warnings.filterwarnings('ignore')
def compare_models(X, y, test_size=0.3):
"""
比较不同复杂度的模型
"""
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
models = {
'Logistic Regression': LogisticRegression(max_iter=1000),
'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
'SVM': SVC(kernel='rbf', probability=True)
}
results = {}
for name, model in models.items():
# 交叉验证
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
# 训练并测试
model.fit(X_train, y_train)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
results[name] = {
'cv_mean': cv_scores.mean(),
'cv_std': cv_scores.std(),
'train_score': train_score,
'test_score': test_score
}
print(f"{name}:")
print(f" CV Accuracy: {cv_scores.mean():.3f} ± {cv_scores.std():.3f}")
print(f" Train Accuracy: {train_score:.3f}")
print(f" Test Accuracy: {test_score:.3f}")
print()
return results
# 使用之前生成的数据
results = compare_models(result['data_pca'], y)
# 可视化比较
plt.figure(figsize=(10, 6))
model_names = list(results.keys())
cv_scores = [results[name]['cv_mean'] for name in model_names]
test_scores = [results[name]['test_score'] for name in model_names]
x = np.arange(len(model_names))
width = 0.35
plt.bar(x - width/2, cv_scores, width, label='交叉验证准确率', alpha=0.8)
plt.bar(x + width/2, test_scores, width, label='测试集准确率', alpha=0.8)
plt.xlabel('模型')
plt.ylabel('准确率')
plt.title('不同复杂度模型的性能比较')
plt.xticks(x, model_names)
plt.legend()
plt.grid(True, axis='y')
plt.tight_layout()
plt.show()
3.3 跨学科沟通与协作
数学建模者与生物学家之间的沟通障碍是常见挑战。
解决方案:可视化与交互工具
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
def create_interactive_visualization(data, model_results):
"""
创建交互式可视化,促进跨学科沟通
"""
# 创建子图
fig = make_subplots(
rows=2, cols=2,
subplot_titles=('模型预测 vs 实际数据', '参数敏感性分析',
'网络拓扑结构', '不确定性量化'),
specs=[[{'type': 'scatter'}, {'type': 'scatter'}],
[{'type': 'scatter'}, {'type': 'scatter'}]]
)
# 1. 模型预测 vs 实际数据
fig.add_trace(
go.Scatter(x=data['time'], y=data['actual'],
mode='lines+markers', name='实际数据',
line=dict(color='blue', width=2)),
row=1, col=1
)
fig.add_trace(
go.Scatter(x=data['time'], y=data['predicted'],
mode='lines', name='模型预测',
line=dict(color='red', width=2, dash='dash')),
row=1, col=1
)
# 2. 参数敏感性
fig.add_trace(
go.Scatter(x=model_results['param_values'],
y=model_results['sensitivity'],
mode='lines+markers', name='参数敏感性',
line=dict(color='green', width=2)),
row=1, col=2
)
# 3. 网络拓扑(示例)
# 这里使用模拟数据
n_nodes = 20
fig.add_trace(
go.Scatter(x=np.random.randn(n_nodes),
y=np.random.randn(n_nodes),
mode='markers', name='网络节点',
marker=dict(size=10, color=np.random.rand(n_nodes),
colorscale='Viridis', showscale=True)),
row=2, col=1
)
# 4. 不确定性量化
fig.add_trace(
go.Scatter(x=model_results['uncertainty']['time'],
y=model_results['uncertainty']['mean'],
mode='lines', name='均值',
line=dict(color='purple', width=2)),
row=2, col=2
)
fig.add_trace(
go.Scatter(x=model_results['uncertainty']['time'],
y=model_results['uncertainty']['upper'],
mode='lines', name='95%上限',
line=dict(color='purple', width=1, dash='dot')),
row=2, col=2
)
fig.add_trace(
go.Scatter(x=model_results['uncertainty']['time'],
y=model_results['uncertainty']['lower'],
mode='lines', name='95%下限',
line=dict(color='purple', width=1, dash='dot'),
fill='tonexty', fillcolor='rgba(128,0,128,0.1)'),
row=2, col=2
)
fig.update_layout(height=800, showlegend=True,
title_text="数学建模结果交互式可视化")
return fig
# 生成模拟数据
time = np.linspace(0, 10, 100)
actual = 2 * np.sin(time) + np.random.randn(100) * 0.2
predicted = 2 * np.sin(time) + 0.1 * time
data = {'time': time, 'actual': actual, 'predicted': predicted}
# 模拟模型结果
model_results = {
'param_values': np.linspace(0.1, 2, 20),
'sensitivity': np.random.rand(20) * 2 + 0.5,
'uncertainty': {
'time': time,
'mean': predicted,
'upper': predicted + 0.3,
'lower': predicted - 0.3
}
}
# 创建交互式图表
fig = create_interactive_visualization(data, model_results)
fig.show()
第四部分:未来发展方向与新兴技术
4.1 人工智能与机器学习的融合
深度学习正在改变数学建模在生物学中的应用方式。
案例:使用深度学习预测基因表达
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
class GeneExpressionPredictor(nn.Module):
"""基于深度学习的基因表达预测模型"""
def __init__(self, input_dim=100, hidden_dim=64, output_dim=10):
super(GeneExpressionPredictor, self).__init__()
self.encoder = nn.Sequential(
nn.Linear(input_dim, hidden_dim),
nn.ReLU(),
nn.Dropout(0.2),
nn.Linear(hidden_dim, hidden_dim // 2),
nn.ReLU(),
nn.Dropout(0.2)
)
self.decoder = nn.Sequential(
nn.Linear(hidden_dim // 2, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, output_dim),
nn.Sigmoid() # 输出在0-1之间
)
def forward(self, x):
encoded = self.encoder(x)
decoded = self.decoder(encoded)
return decoded
def train_gene_expression_model():
"""训练基因表达预测模型"""
# 生成模拟数据
n_samples = 1000
input_dim = 100
output_dim = 10
# 输入:转录因子表达
X = torch.randn(n_samples, input_dim)
# 输出:目标基因表达(非线性关系)
# 模拟复杂的调控关系
with torch.no_grad():
true_weights = torch.randn(input_dim, output_dim) * 0.1
true_bias = torch.randn(output_dim) * 0.1
Y = torch.sigmoid(X @ true_weights + true_bias + 0.1 * torch.randn(n_samples, output_dim))
# 创建数据集
dataset = TensorDataset(X, Y)
train_size = int(0.8 * n_samples)
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, n_samples - train_size])
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
# 模型和优化器
model = GeneExpressionPredictor(input_dim, 64, output_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# 训练循环
train_losses = []
test_losses = []
for epoch in range(100):
model.train()
epoch_train_loss = 0
for batch_X, batch_Y in train_loader:
optimizer.zero_grad()
outputs = model(batch_X)
loss = criterion(outputs, batch_Y)
loss.backward()
optimizer.step()
epoch_train_loss += loss.item()
# 测试
model.eval()
epoch_test_loss = 0
with torch.no_grad():
for batch_X, batch_Y in test_loader:
outputs = model(batch_X)
loss = criterion(outputs, batch_Y)
epoch_test_loss += loss.item()
train_losses.append(epoch_train_loss / len(train_loader))
test_losses.append(epoch_test_loss / len(test_loader))
if epoch % 20 == 0:
print(f"Epoch {epoch}: Train Loss = {train_losses[-1]:.4f}, Test Loss = {test_losses[-1]:.4f}")
# 可视化训练过程
plt.figure(figsize=(10, 6))
plt.plot(train_losses, label='训练损失', linewidth=2)
plt.plot(test_losses, label='测试损失', linewidth=2)
plt.xlabel('Epoch')
plt.ylabel('损失')
plt.title('深度学习模型训练过程')
plt.legend()
plt.grid(True)
plt.show()
return model, train_losses, test_losses
# 运行训练
model, train_losses, test_losses = train_gene_expression_model()
4.2 多组学数据整合
整合基因组、转录组、蛋白质组等多组学数据是当前热点。
多组学整合分析框架:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF
from sklearn.manifold import TSNE
import umap
class MultiOmicsIntegration:
"""多组学数据整合分析"""
def __init__(self):
self.data = {}
self.integrated = None
def add_omics_data(self, name, data, metadata=None):
"""添加组学数据"""
self.data[name] = {
'data': data,
'metadata': metadata
}
def integrate_nmf(self, n_components=10):
"""使用非负矩阵分解整合多组学数据"""
# 对齐数据(简化处理)
aligned_data = []
for name in self.data.keys():
# 标准化
data = self.data[name]['data']
if isinstance(data, pd.DataFrame):
data = data.values
data_norm = (data - data.mean(axis=0)) / (data.std(axis=0) + 1e-8)
aligned_data.append(data_norm)
# 拼接数据
combined = np.hstack(aligned_data)
# NMF分解
nmf = NMF(n_components=n_components, random_state=42)
W = nmf.fit_transform(combined)
H = nmf.components_
self.integrated = {
'W': W, # 样本在潜在空间的表示
'H': H, # 特征在潜在空间的表示
'n_components': n_components
}
return W, H
def visualize_integration(self):
"""可视化整合结果"""
if self.integrated is None:
print("请先运行整合方法")
return
W = self.integrated['W']
# t-SNE降维
tsne = TSNE(n_components=2, random_state=42)
W_tsne = tsne.fit_transform(W)
# UMAP降维
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, random_state=42)
W_umap = reducer.fit_transform(W)
# 可视化
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
# 原始潜在空间
axes[0].scatter(W[:, 0], W[:, 1], alpha=0.6)
axes[0].set_xlabel('潜在维度1')
axes[0].set_ylabel('潜在维度2')
axes[0].set_title('NMF潜在空间')
axes[0].grid(True)
# t-SNE
axes[1].scatter(W_tsne[:, 0], W_tsne[:, 1], alpha=0.6, c='orange')
axes[1].set_xlabel('t-SNE 1')
axes[1].set_ylabel('t-SNE 2')
axes[1].set_title('t-SNE可视化')
axes[1].grid(True)
# UMAP
axes[2].scatter(W_umap[:, 0], W_umap[:, 1], alpha=0.6, c='green')
axes[2].set_xlabel('UMAP 1')
axes[2].set_ylabel('UMAP 2')
axes[2].set_title('UMAP可视化')
axes[2].grid(True)
plt.tight_layout()
plt.show()
# 示例:模拟多组学数据
np.random.seed(42)
n_samples = 100
# 模拟基因组数据(SNP)
genomic_data = np.random.randint(0, 2, size=(n_samples, 50))
# 模拟转录组数据(基因表达)
transcriptomic_data = np.random.randn(n_samples, 100)
# 模拟蛋白质组数据
proteomic_data = np.random.randn(n_samples, 80)
# 创建整合分析对象
integration = MultiOmicsIntegration()
integration.add_omics_data('genomic', genomic_data)
integration.add_omics_data('transcriptomic', transcriptomic_data)
integration.add_omics_data('proteomic', proteomic_data)
# 运行整合
W, H = integration.integrate_nmf(n_components=5)
print(f"整合后样本表示维度: {W.shape}")
print(f"整合后特征表示维度: {H.shape}")
# 可视化
integration.visualize_integration()
4.3 云计算与高性能计算
大规模生物数学建模需要强大的计算资源。
案例:使用Dask进行并行计算
import dask.array as da
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
import time
def parallel_biological_simulation(n_simulations=10000, n_steps=1000):
"""
使用Dask进行并行生物模拟
"""
# 启动本地集群
cluster = LocalCluster(n_workers=4, threads_per_worker=2, memory_limit='4GB')
client = Client(cluster)
print(f"集群信息: {client}")
# 创建Dask数组
# 模拟多个独立的生物过程
data = da.random.random((n_simulations, n_steps), chunks=(1000, 1000))
# 定义模拟函数
def simulate_single_trajectory(traj):
"""模拟单个轨迹"""
# 简化的生物过程模拟
result = np.zeros_like(traj)
result[0] = 0.1
for t in range(1, len(traj)):
# 简单的随机过程
result[t] = result[t-1] + 0.01 * (1 - result[t-1]) + 0.001 * np.random.randn()
return result
# 并行应用函数
results = data.map_blocks(simulate_single_trajectory,
dtype=float,
chunks=(1000, 1000))
# 计算统计量
mean_trajectory = results.mean(axis=0)
std_trajectory = results.std(axis=0)
# 执行计算
start_time = time.time()
mean_result = mean_trajectory.compute()
std_result = std_trajectory.compute()
end_time = time.time()
print(f"并行计算时间: {end_time - start_time:.2f}秒")
# 可视化
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(mean_result, 'b-', linewidth=2, label='均值')
plt.fill_between(range(len(mean_result)),
mean_result - 1.96*std_result,
mean_result + 1.96*std_result,
alpha=0.3, label='95%置信区间')
plt.xlabel('时间步')
plt.ylabel('模拟值')
plt.title('并行模拟结果')
plt.legend()
plt.grid(True)
plt.subplot(1, 2, 2)
plt.hist(results[:, -1].compute(), bins=30, density=True, alpha=0.7, edgecolor='black')
plt.xlabel('最终值')
plt.ylabel('概率密度')
plt.title('最终状态分布')
plt.grid(True)
plt.tight_layout()
plt.show()
# 关闭集群
client.close()
cluster.close()
return mean_result, std_result
# 运行并行模拟
mean_result, std_result = parallel_biological_simulation(n_simulations=5000, n_steps=500)
第五部分:实际应用案例研究
5.1 COVID-19疫情预测与防控策略优化
案例:基于多源数据的疫情预测模型
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import TimeSeriesSplit
import matplotlib.pyplot as plt
class COVID19PredictionModel:
"""COVID-19疫情预测模型"""
def __init__(self):
self.model = GradientBoostingRegressor(n_estimators=100, random_state=42)
self.features = None
self.target = None
def prepare_data(self, data_path=None):
"""准备数据(模拟数据)"""
if data_path is None:
# 生成模拟数据
np.random.seed(42)
n_days = 200
# 时间特征
time = np.arange(n_days)
# 模拟病例数(包含季节性和趋势)
trend = 0.1 * time
seasonality = 10 * np.sin(2 * np.pi * time / 30)
noise = np.random.randn(n_days) * 5
cases = 50 + trend + seasonality + noise
# 模拟其他特征
mobility = 100 - 0.5 * time + np.random.randn(n_days) * 2
testing_rate = 0.1 + 0.001 * time + np.random.randn(n_days) * 0.01
vaccination_rate = np.maximum(0, 0.002 * (time - 50) + np.random.randn(n_days) * 0.001)
# 创建DataFrame
df = pd.DataFrame({
'day': time,
'cases': cases,
'mobility': mobility,
'testing_rate': testing_rate,
'vaccination_rate': vaccination_rate,
'day_of_week': time % 7
})
# 添加滞后特征
for lag in [1, 3, 7, 14]:
df[f'cases_lag_{lag}'] = df['cases'].shift(lag)
# 添加移动平均
df['cases_ma_7'] = df['cases'].rolling(7).mean()
df['cases_ma_14'] = df['cases'].rolling(14).mean()
# 添加增长率
df['growth_rate'] = df['cases'].pct_change()
# 清理NaN值
df = df.dropna()
return df
def train_model(self, df):
"""训练预测模型"""
# 特征和目标
feature_cols = [col for col in df.columns if col not in ['day', 'cases']]
self.features = df[feature_cols]
self.target = df['cases']
# 时间序列交叉验证
tscv = TimeSeriesSplit(n_splits=5)
cv_scores = []
for train_idx, val_idx in tscv.split(self.features):
X_train, X_val = self.features.iloc[train_idx], self.features.iloc[val_idx]
y_train, y_val = self.target.iloc[train_idx], self.target.iloc[val_idx]
self.model.fit(X_train, y_train)
score = self.model.score(X_val, y_val)
cv_scores.append(score)
print(f"交叉验证R²分数: {cv_scores}")
print(f"平均R²分数: {np.mean(cv_scores):.3f}")
# 特征重要性
importance = pd.DataFrame({
'feature': feature_cols,
'importance': self.model.feature_importances_
}).sort_values('importance', ascending=False)
print("\n特征重要性(前10):")
print(importance.head(10))
return importance
def predict_future(self, df, future_days=30):
"""预测未来"""
# 使用最后已知数据作为起点
last_row = df.iloc[-1:].copy()
predictions = []
current_data = last_row.copy()
for day in range(1, future_days + 1):
# 预测
pred = self.model.predict(current_data[self.features.columns])[0]
predictions.append(pred)
# 更新特征(简化处理)
new_row = current_data.copy()
new_row['day'] += 1
new_row['cases'] = pred
# 更新滞后特征
for lag in [1, 3, 7, 14]:
if lag == 1:
new_row[f'cases_lag_{lag}'] = current_data['cases'].values[0]
else:
new_row[f'cases_lag_{lag}'] = current_data[f'cases_lag_{lag-1}'].values[0]
# 更新移动平均
new_row['cases_ma_7'] = (current_data['cases'].values[0] +
current_data['cases_lag_1'].values[0] +
current_data['cases_lag_2'].values[0] +
current_data['cases_lag_3'].values[0] +
current_data['cases_lag_4'].values[0] +
current_data['cases_lag_5'].values[0] +
current_data['cases_lag_6'].values[0]) / 7
# 更新增长率
new_row['growth_rate'] = (pred - current_data['cases'].values[0]) / current_data['cases'].values[0]
current_data = new_row
return predictions
# 运行分析
model = COVID19PredictionModel()
df = model.prepare_data()
importance = model.train_model(df)
predictions = model.predict_future(df, future_days=30)
# 可视化
plt.figure(figsize=(12, 6))
# 历史数据
plt.plot(df['day'], df['cases'], 'b-', label='历史病例', linewidth=2)
# 预测
future_days = np.arange(df['day'].max() + 1, df['day'].max() + 31)
plt.plot(future_days, predictions, 'r--', label='预测病例', linewidth=2)
plt.xlabel('天数')
plt.ylabel('病例数')
plt.title('COVID-19病例预测')
plt.legend()
plt.grid(True)
plt.show()
5.2 癌症基因调控网络分析
案例:使用微分方程模型分析癌症相关基因网络
import numpy as np
from scipy.integrate import solve_ivp
import matplotlib.pyplot as plt
class CancerGeneNetwork:
"""癌症相关基因调控网络模型"""
def __init__(self):
self.genes = ['p53', 'MDM2', 'p21', 'CyclinD1', 'CDK4', 'E2F', 'Apoptosis']
self.n_genes = len(self.genes)
def cancer_ode_model(self, t, y, params):
"""
癌症相关基因调控ODE模型
y: 各基因的表达水平
"""
p53, MDM2, p21, CyclinD1, CDK4, E2F, Apoptosis = y
# 参数(示例值,实际需要实验数据校准)
alpha_p53, beta_p53, delta_p53 = params[0:3] # p53参数
alpha_MDM2, beta_MDM2, delta_MDM2 = params[3:6] # MDM2参数
alpha_p21, delta_p21 = params[6:8] # p21参数
alpha_CyclinD1, delta_CyclinD1 = params[8:10] # CyclinD1参数
alpha_CDK4, delta_CDK4 = params[10:12] # CDK4参数
alpha_E2F, delta_E2F = params[12:14] # E2F参数
alpha_Apoptosis, delta_Apoptosis = params[14:16] # Apoptosis参数
# p53动力学(受MDM2负调控)
dp53_dt = alpha_p53 / (1 + MDM2) - delta_p53 * p53
# MDM2动力学(受p53正调控)
dMDM2_dt = alpha_MDM2 * p53 - delta_MDM2 * MDM2
# p21动力学(受p53正调控)
dp21_dt = alpha_p21 * p53 - delta_p21 * p21
# CyclinD1动力学(受E2F正调控,受p21负调控)
dCyclinD1_dt = alpha_CyclinD1 * E2F / (1 + p21) - delta_CyclinD1 * CyclinD1
# CDK4动力学(受CyclinD1正调控)
dCDK4_dt = alpha_CDK4 * CyclinD1 - delta_CDK4 * CDK4
# E2F动力学(受CDK4正调控,受p21负调控)
dE2F_dt = alpha_E2F * CDK4 / (1 + p21) - delta_E2F * E2F
# Apoptosis动力学(受p53正调控,受E2F负调控)
dApoptosis_dt = alpha_Apoptosis * p53 / (1 + E2F) - delta_Apoptosis * Apoptosis
return [dp53_dt, dMDM2_dt, dp21_dt, dCyclinD1_dt, dCDK4_dt, dE2F_dt, dApoptosis_dt]
def simulate_cancer_dynamics(self, params, initial_conditions, t_span=(0, 100)):
"""模拟癌症基因动态"""
sol = solve_ivp(self.cancer_ode_model, t_span, initial_conditions,
args=(params,), dense_output=True)
return sol
def analyze_cancer_states(self, sol, t_eval):
"""分析癌症状态"""
# 计算不同时间点的基因表达
y = sol.sol(t_eval)
# 定义癌症状态指标
# 1. 增殖指数:CyclinD1 * CDK4 * E2F
proliferation_index = y[3] * y[4] * y[5]
# 2. 凋亡指数:Apoptosis / (E2F + 0.1)
apoptosis_index = y[6] / (y[5] + 0.1)
# 3. 抑癌指数:p53 * p21
tumor_suppressor_index = y[0] * y[2]
return {
'time': t_eval,
'proliferation': proliferation_index,
'apoptosis': apoptosis_index,
'tumor_suppressor': tumor_suppressor_index,
'raw_data': y
}
# 运行模拟
cancer_model = CancerGeneNetwork()
# 参数设置(正常细胞 vs 癌细胞)
# 正常细胞参数
normal_params = [
0.5, 0.1, 0.2, # p53
0.3, 0.1, 0.2, # MDM2
0.4, 0.15, # p21
0.2, 0.1, # CyclinD1
0.3, 0.1, # CDK4
0.2, 0.1, # E2F
0.1, 0.05 # Apoptosis
]
# 癌细胞参数(p53突变,增殖增强)
cancer_params = [
0.1, 0.1, 0.2, # p53活性降低
0.3, 0.1, 0.2, # MDM2
0.1, 0.15, # p21降低
0.6, 0.1, # CyclinD1增强
0.5, 0.1, # CDK4增强
0.6, 0.1, # E2F增强
0.05, 0.05 # Apoptosis降低
]
# 初始条件
initial_conditions = [0.5, 0.3, 0.4, 0.2, 0.3, 0.2, 0.1]
# 时间点
t_eval = np.linspace(0, 100, 1000)
# 模拟正常细胞
sol_normal = cancer_model.simulate_cancer_dynamics(normal_params, initial_conditions)
results_normal = cancer_model.analyze_cancer_states(sol_normal, t_eval)
# 模拟癌细胞
sol_cancer = cancer_model.simulate_cancer_dynamics(cancer_params, initial_conditions)
results_cancer = cancer_model.analyze_cancer_states(sol_cancer, t_eval)
# 可视化
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
# 正常细胞动态
axes[0, 0].plot(results_normal['time'], results_normal['raw_data'][0], label='p53')
axes[0, 0].plot(results_normal['time'], results_normal['raw_data'][2], label='p21')
axes[0, 0].set_title('正常细胞:抑癌基因')
axes[0, 0].legend()
axes[0, 0].grid(True)
axes[0, 1].plot(results_normal['time'], results_normal['raw_data'][3], label='CyclinD1')
axes[0, 1].plot(results_normal['time'], results_normal['raw_data'][5], label='E2F')
axes[0, 1].set_title('正常细胞:增殖基因')
axes[0, 1].legend()
axes[0, 1].grid(True)
axes[0, 2].plot(results_normal['time'], results_normal['proliferation'], label='增殖指数')
axes[0, 2].plot(results_normal['time'], results_normal['apoptosis'], label='凋亡指数')
axes[0, 2].plot(results_normal['time'], results_normal['tumor_suppressor'], label='抑癌指数')
axes[0, 2].set_title('正常细胞:状态指数')
axes[0, 2].legend()
axes[0, 2].grid(True)
# 癌细胞动态
axes[1, 0].plot(results_cancer['time'], results_cancer['raw_data'][0], label='p53')
axes[1, 0].plot(results_cancer['time'], results_cancer['raw_data'][2], label='p21')
axes[1, 0].set_title('癌细胞:抑癌基因')
axes[1, 0].legend()
axes[1, 0].grid(True)
axes[1, 1].plot(results_cancer['time'], results_cancer['raw_data'][3], label='CyclinD1')
axes[1, 1].plot(results_cancer['time'], results_cancer['raw_data'][5], label='E2F')
axes[1, 1].set_title('癌细胞:增殖基因')
axes[1, 1].legend()
axes[1, 1].grid(True)
axes[1, 2].plot(results_cancer['time'], results_cancer['proliferation'], label='增殖指数')
axes[1, 2].plot(results_cancer['time'], results_cancer['apoptosis'], label='凋亡指数')
axes[1, 2].plot(results_cancer['time'], results_cancer['tumor_suppressor'], label='抑癌指数')
axes[1, 2].set_title('癌细胞:状态指数')
axes[1, 2].legend()
axes[1, 2].grid(True)
plt.tight_layout()
plt.show()
# 比较分析
plt.figure(figsize=(12, 6))
plt.plot(results_normal['time'], results_normal['proliferation'],
'b-', label='正常细胞增殖', linewidth=2)
plt.plot(results_cancer['time'], results_cancer['proliferation'],
'r-', label='癌细胞增殖', linewidth=2)
plt.xlabel('时间')
plt.ylabel('增殖指数')
plt.title('正常细胞 vs 癌细胞增殖动力学比较')
plt.legend()
plt.grid(True)
plt.show()
第六部分:挑战与未来展望
6.1 当前主要挑战
- 数据质量与可重复性:生物学实验数据的噪声大、批次效应明显
- 模型验证困难:缺乏金标准数据验证模型预测
- 计算复杂性:大规模模型需要巨大计算资源
- 跨学科沟通障碍:数学语言与生物学术语的差异
6.2 未来发展方向
- 可解释人工智能:开发既能准确预测又能提供生物学解释的模型
- 实时建模:结合传感器数据实现动态实时建模
- 个性化医疗:基于个体基因组数据的精准建模
- 量子计算应用:利用量子计算处理超大规模生物网络
6.3 实用建议
对于希望进入该领域的研究者:
- 基础技能:掌握微分方程、概率统计、机器学习基础
- 工具学习:熟练使用Python/R、MATLAB、专业生物信息学工具
- 跨学科合作:主动与生物学家沟通,理解实验设计和生物学意义
- 持续学习:关注领域最新进展,参加相关会议和工作坊
结论
数学建模已经成为破解生物学难题不可或缺的工具。从传染病预测到基因调控网络分析,数学模型提供了定量分析和预测的强大能力。尽管面临数据质量、模型复杂性和跨学科沟通等挑战,但随着人工智能、多组学技术和高性能计算的发展,数学建模在生物学中的应用前景将更加广阔。
未来,数学建模与生物学的深度融合将推动精准医疗、合成生物学和疾病预防等领域的突破性进展。对于研究者而言,掌握数学建模技能并理解生物学背景,将成为在这一交叉领域取得成功的关键。
本文通过详细的数学模型、代码示例和实际案例,系统介绍了数学建模在生物学中的应用。所有代码示例均可在Python环境中运行,为读者提供了实践参考。
