# Feature selection and visualization code
import pandas as pd
import matplotlib.pyplot as plt
import random
from sklearn import tree
from sklearn.model_selection import cross_val_score
# Read the data file: it has no header row and is comma-separated.
data = pd.read_csv('./dataset/sonar.all-data', sep=',', header=None)
print(data.head())

# Every column except the last is a feature; the last column is the target.
X = data.iloc[:, :-1]
y = data.iloc[:, -1:].to_numpy().flatten()

# Genetic-algorithm settings.
iterations = 100  # number of generations
pop_size = 100  # population size (number of chromosomes)
pc = 0.25  # crossover probability
pm = 0.01  # mutation probability
chrom_length = data.shape[1] - 1  # chromosome length: one gene per feature column

# Mutable state shared by the GA functions below.
pop = []  # population
fitness_list = []  # fitness of each chromosome
ratio_list = []  # cumulative selection probability
# 初始化种群
def geneEncoding():
    """Append ``pop_size`` random chromosomes to the global ``pop``.

    A chromosome is a list of ``chrom_length`` genes, each drawn with
    ``random.randint(0, 1)``. A candidate that contains no 1 is discarded
    and a new one is drawn, so every stored chromosome has at least one 1.
    ``pop`` is extended in place; it is not cleared first.
    """
    accepted = 0
    while accepted < pop_size:
        candidate = [random.randint(0, 1) for _ in range(chrom_length)]
        if 1 not in candidate:
            # All-zero chromosomes are not allowed: draw again.
            continue
        pop.append(candidate)
        accepted += 1
# 计算适应度
def calFitness():
    """Score every chromosome in ``pop`` and store the scores in ``fitness_list``.

    ``fitness_list`` is cleared and then receives one entry per chromosome,
    in population order. A chromosome's fitness is the mean
    ``cross_val_score`` (``cv=5``) of a fresh ``DecisionTreeClassifier``
    trained on the columns of ``X`` whose gene is non-zero. A chromosome
    that selects no column gets fitness 0 and no classifier is trained.
    """
    fitness_list.clear()
    for chromosome in pop:
        # Positions of the genes that keep their feature column.
        selected = [idx for idx, gene in enumerate(chromosome) if gene != 0]
        if not selected:
            # An all-zero chromosome has fitness 0.
            fitness_list.append(0)
            continue
        # Take all kept columns in one positional selection instead of
        # dropping the rejected ones one at a time. X is built from a
        # header-less CSV, so column labels and positions coincide.
        features = X.iloc[:, selected].values
        clf = tree.DecisionTreeClassifier()  # decision tree as the classifier
        fitness_list.append(cross_val_score(clf, features, y, cv=5).mean())
# 计算适应度的总和
def sumFitness():
    """Return the sum of all values currently stored in ``fitness_list``.

    The whole list is summed, so the result does not depend on ``pop_size``
    being in sync with the list. An empty list gives 0.
    """
    return sum(fitness_list)
# 计算每条染色体的累计概率
def getRatio():
    """Rebuild ``ratio_list`` as the running total of ``fitness_list``.

    Entry ``k`` is the sum of the first ``k + 1`` fitness values, taken over
    the first ``pop_size`` entries. The final entry is then overwritten
    with 1.
    """
    ratio_list.clear()
    ratio_list.append(fitness_list[0])
    for idx in range(1, pop_size):
        previous_total = ratio_list[-1]
        ratio_list.append(previous_total + fitness_list[idx])
    # Force the upper bound of the last interval to exactly 1.
    ratio_list[-1] = 1
# 选择
def selection():
    """Replace ``pop`` with ``pop_size`` chromosomes sampled by fitness share.

    Steps:
      1. Divide every entry of ``fitness_list`` by the total fitness, in
         place, so the list holds selection probabilities afterwards. If the
         total is 0, every chromosome gets the same share instead of
         dividing by zero.
      2. Call ``getRatio()`` to rebuild the cumulative ``ratio_list``.
      3. Draw ``pop_size`` numbers with ``random.random()``, sort them, and
         walk the draws and ``ratio_list`` together; each draw picks the
         first chromosome whose cumulative bound is greater than the draw.

    Each picked chromosome is copied into the new population. Without the
    copy, a chromosome picked more than once would be the same list object
    in several slots, and a later in-place change to one slot would change
    all of them (and any reference kept to the old population).
    """
    global pop
    total_fitness = sumFitness()
    if total_fitness > 0:
        for idx in range(pop_size):
            fitness_list[idx] = fitness_list[idx] / total_fitness
    else:
        # No chromosome scored above zero: select uniformly.
        for idx in range(pop_size):
            fitness_list[idx] = 1 / pop_size
    getRatio()

    draws = sorted(random.random() for _ in range(pop_size))
    new_pop = []
    slot = 0  # index of the chromosome whose interval is being tested
    for draw in draws:
        # ratio_list ends with 1 and every draw is below 1, so this stops.
        while draw >= ratio_list[slot]:
            slot += 1
        new_pop.append(list(pop[slot]))  # copy: never share list objects
    pop = new_pop
# 交叉
def crossover():
    """Single-point crossover between neighbouring chromosomes of ``pop``.

    For each index ``k`` from 0 to ``pop_size - 2``, with probability ``pc``
    a cut point is drawn with ``random.randint(0, chrom_length - 1)`` and
    chromosomes ``k`` and ``k + 1`` are replaced by two new lists that swap
    their tails from the cut point onward. Pairs are processed in order, so
    chromosome ``k + 1`` may already be the product of the previous pair.
    """
    for left in range(pop_size - 1):
        if random.random() >= pc:
            continue  # no crossover for this pair
        right = left + 1
        cut = random.randint(0, chrom_length - 1)  # random crossover point
        parent_a = pop[left]
        parent_b = pop[right]
        pop[left] = [*parent_a[:cut], *parent_b[cut:]]
        pop[right] = [*parent_b[:cut], *parent_a[cut:]]
# 变异
def mutation():
    """Flip one random gene in some chromosomes of ``pop``, in place.

    Each of the first ``pop_size`` chromosomes is mutated with probability
    ``pm``. A mutation draws one position with
    ``random.randint(0, chrom_length - 1)`` and sets that gene to 0 if it
    is 1, otherwise to 1.
    """
    for idx in range(pop_size):
        if random.random() >= pm:
            continue  # this chromosome is left unchanged
        locus = random.randint(0, chrom_length - 1)  # random mutation point
        chromosome = pop[idx]
        chromosome[locus] = 0 if chromosome[locus] == 1 else 1
# 最优解
def getBest():
    """Return ``(best_chrom, best_fitness)`` for the current population.

    The best chromosome is the one with the largest value in
    ``fitness_list``; on ties the earliest one wins. The chromosome is
    returned as a copy, so later in-place changes to the population do not
    alter a result the caller has already recorded.

    Raises:
        IndexError: if ``pop`` or ``fitness_list`` is empty.
    """
    best_index = 0
    best_fitness = fitness_list[0]
    for index, fitness in enumerate(fitness_list):
        if fitness > best_fitness:  # strict: keep the first of equal maxima
            best_index = index
            best_fitness = fitness
    return list(pop[best_index]), best_fitness
if __name__=='__main__':
    # Driver: run the genetic algorithm for `iterations` generations and plot
    # the best fitness of each generation while the run is in progress.
    # NOTE(review): the pasted source had lost its indentation; everything
    # from print(i) to plt.pause(...) is assumed to be the loop body — confirm.
    px = []  # x values of the curve: generation indices
    py = []  # y values of the curve: best fitness per generation
    plt.ion()  # switch matplotlib to interactive mode
    results = []  # one [generation, best chromosome, best fitness] entry per generation
    geneEncoding() # initialise the population
    for i in range(iterations):
        print(i)
        calFitness() # compute the fitness of every chromosome in the population
        # The best individual is taken before selection/crossover/mutation
        # change the population.
        best_chrom, best_fitness = getBest()
        results.append([i, best_chrom, best_fitness])
        selection() # selection
        crossover() # crossover
        mutation() # mutation
        plt.title('GA feature selection')
        plt.xlabel('iterations')
        plt.ylabel('best fitness')
        plt.xlim((0, iterations)) # x-axis range
        plt.ylim((0.6, 0.8)) # y-axis range (fixed; values outside it are not visible)
        print([i, best_chrom, best_fitness])
        px.append(i) # extend the plotted curve
        py.append(best_fitness)
        plt.plot(px,py)  # draws the whole curve so far on every iteration
        plt.show()
        plt.pause(0.001)  # short pause; presumably lets the figure window redraw
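# Use a genetic algorithm for feature selection and plot the result.
# Below, the t-SNE algorithm is used to reduce the dimensionality of the features and display them: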
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# Load the dataset (sonar.all-data is used here; make sure the path is correct).
data = pd.read_csv('./dataset/sonar.all-data', header=None, sep=',')

# Feature mask produced by the genetic-algorithm feature selection:
# 1 keeps the feature column at that position, 0 discards it.
arr = [0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1]

# The mask must have one entry per feature column (every column except the
# last); otherwise it could select the label column or miss features.
if len(arr) != data.shape[1] - 1:
    raise ValueError(
        f"feature mask has {len(arr)} entries but the data has "
        f"{data.shape[1] - 1} feature columns"
    )

# Positions of all mask entries equal to 1.
indices_of_ones = [i for i, value in enumerate(arr) if value == 1]
print(indices_of_ones)

# The last column is the label; the selected columns are the features.
X = data.iloc[:, indices_of_ones]  # feature data
y = data.iloc[:, -1]  # label data (already a Series, no flatten needed)

# Set up the t-SNE model. perplexity and max_iter can be tuned.
# Note: newer scikit-learn versions deprecate n_iter in favour of max_iter,
# so max_iter is used here.
tsne = TSNE(n_components=3, perplexity=30, max_iter=300, random_state=42)

# Run the t-SNE dimensionality reduction.
X_tsne = tsne.fit_transform(X)

# Visualise the result: one scatter series per class. The embedding has three
# components, so it is drawn on 3-D axes instead of dropping the third one.
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')
label_to_color = {'R': 'red', 'M': 'blue'}
for label in y.unique():
    # Rows of the embedding that belong to the current class.
    points = X_tsne[(y == label).to_numpy()]
    ax.scatter(points[:, 0], points[:, 1], points[:, 2],
               color=label_to_color[label], label=label)
ax.legend()
ax.set_title('t-SNE Visualization of Sonar Dataset')
ax.set_xlabel('t-SNE Feature 1')
ax.set_ylabel('t-SNE Feature 2')
ax.set_zlabel('t-SNE Feature 3')
plt.show()
# Below, the confusion matrix is plotted:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# Compute a confusion matrix from y_true / y_pred, print it, and draw it as an
# annotated seaborn heatmap.
# NOTE(review): y_true and y_pred are not assigned in this snippet (the sample
# values below are commented out); they must be defined before this code runs,
# otherwise confusion_matrix(...) raises NameError.
# # Suppose these are your model's predictions
# y_pred = [2, 0, 2, 2, 0, 1, 1, 2]
# # Suppose these are the true labels
# y_true = [2, 0, 2, 2, 0, 1, 0, 2]
# Compute the confusion matrix
cm = confusion_matrix(y_true, y_pred)
print(cm)
# Draw the confusion matrix
# Tick labels for the rows and columns of the heatmap.
# NOTE(review): three names are hard-coded; presumably they correspond to
# labels 0, 1, 2 in that order — confirm, and confirm cm is always 3x3.
classes = ['other', 'frogman', 'UUV']
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=classes, yticklabels=classes, annot_kws={"size": 16})
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.title('Confusion Matrix')
# Alternative title (disabled): 'Trajectory-feature confusion matrix'
# plt.title('轨迹特征混淆矩阵')
plt.show()