未分类

特征选择与显示代码


import pandas as pd
import matplotlib.pyplot as plt
import random
from sklearn import tree
from sklearn.model_selection import cross_val_score

# Genetic-algorithm hyperparameters.
iterations = 100  # number of generations
pop_size = 100    # population size (number of chromosomes)
pc = 0.25         # crossover probability
pm = 0.01         # mutation probability

# Load the dataset: every column except the last is a feature, the last is the label.
data = pd.read_csv('./dataset/sonar.all-data', header=None, sep=',')
print(data.head())
X = data.iloc[:, :-1]
y = data.iloc[:, -1:].to_numpy().ravel()

chrom_length = X.shape[1]  # chromosome length: one gene per feature column

# Mutable GA state shared by the functions below.
pop = []           # population
fitness_list = []  # fitness of each chromosome
ratio_list = []    # cumulative selection probability


# Initialize the population
def geneEncoding():
    """Fill the global ``pop`` with ``pop_size`` random binary chromosomes.

    Each chromosome is a list of ``chrom_length`` genes drawn with
    ``random.randint(0, 1)``. A candidate made only of zeros is discarded and
    redrawn. Chromosomes already stored in ``pop`` are removed first, so a
    repeated call leaves exactly ``pop_size`` chromosomes instead of growing
    the population.

    Raises:
        ValueError: if ``chrom_length`` is below 1 while ``pop_size`` is
            positive, because no chromosome containing a 1 could be drawn.
    """
    if pop_size > 0 and chrom_length < 1:
        raise ValueError("chrom_length must be at least 1")

    pop.clear()
    while len(pop) < pop_size:
        chromosome = [random.randint(0, 1) for _ in range(chrom_length)]
        if any(chromosome):  # a chromosome must not be all zeros
            pop.append(chromosome)
        

# Compute fitness
def calFitness():
    """Recompute ``fitness_list`` for the first ``pop_size`` chromosomes of ``pop``.

    The fitness of a chromosome is the mean of the scores returned by
    ``cross_val_score`` (``cv=5``) for a fresh ``DecisionTreeClassifier``
    fitted on the columns of ``X`` whose gene is non-zero, with ``y`` as the
    target. A chromosome that selects no column gets fitness 0.
    """
    fitness_list.clear()
    for chromosome in pop[:pop_size]:
        # Gene j != 0 keeps the column labelled j.
        selected = [j for j in range(chrom_length) if chromosome[j] != 0]
        if not selected:
            fitness_list.append(0)  # an all-zero chromosome has fitness 0
            continue

        # Select the kept columns in one step rather than dropping the
        # unselected ones one at a time (each drop copies the frame).
        X_test = X.loc[:, selected].values
        clf = tree.DecisionTreeClassifier()  # decision tree as the classifier
        fitness_list.append(cross_val_score(clf, X_test, y, cv=5).mean())

# Sum of the fitness values
def sumFitness():
    """Return the sum of the first ``pop_size`` entries of ``fitness_list``.

    Returns 0 when there is nothing to sum.
    """
    return sum(fitness_list[:pop_size])

# Cumulative probability of each chromosome
def getRatio():
    """Rebuild ``ratio_list`` as the running sum of ``fitness_list``.

    Entry ``k`` holds ``fitness_list[0] + ... + fitness_list[k]`` for the first
    ``pop_size`` entries. The final entry is then overwritten with 1.
    """
    ratio_list.clear()
    cumulative = fitness_list[0]
    ratio_list.append(cumulative)
    for idx in range(1, pop_size):
        cumulative = cumulative + fitness_list[idx]
        ratio_list.append(cumulative)
    # Pin the upper bound to exactly 1 regardless of the accumulated value.
    ratio_list[-1] = 1

# Selection
def selection():
    """Replace ``pop`` with ``pop_size`` chromosomes drawn in proportion to fitness.

    Side effects:
        * ``fitness_list`` is overwritten in place with each chromosome's
          share of the total fitness.
        * ``ratio_list`` is rebuilt through ``getRatio()``.
        * the global ``pop`` is rebound to the new population.

    When the total fitness is 0 every chromosome gets the same share, because
    proportional shares cannot be computed.
    """
    global pop
    total_fitness = sumFitness()
    for i in range(pop_size):
        if total_fitness == 0:
            fitness_list[i] = 1 / pop_size
        else:
            fitness_list[i] = fitness_list[i] / total_fitness
    getRatio()

    # Sorted random thresholds let one forward pass over ratio_list serve all draws.
    rand_ratio = sorted(random.random() for _ in range(pop_size))

    new_pop = []  # new population
    j = 0  # index of the chromosome whose cumulative range is being tested
    for threshold in rand_ratio:
        # Advance to the first chromosome whose cumulative bound exceeds the
        # threshold; ratio_list[-1] is 1 and random.random() is below 1.
        while threshold >= ratio_list[j]:
            j += 1
        # Copy the chromosome: it may be selected several times, and
        # mutation() edits chromosomes in place, which would otherwise change
        # every duplicate at once.
        new_pop.append(list(pop[j]))

    pop = new_pop

# Crossover
def crossover():
    """Apply single-point crossover between neighbouring chromosomes.

    For each index ``i`` in ``range(pop_size - 1)``, with probability ``pc``
    chromosome ``i`` and chromosome ``i + 1`` exchange their tails at a cut
    point drawn with ``random.randint(0, chrom_length - 1)``. The two slots in
    ``pop`` are replaced by newly built lists.
    """
    for left in range(pop_size - 1):
        if random.random() >= pc:
            continue  # this pair does not cross over
        right = left + 1
        cut = random.randint(0, chrom_length - 1)  # random crossover point
        first, second = pop[left], pop[right]
        pop[left] = [*first[:cut], *second[cut:]]
        pop[right] = [*second[:cut], *first[cut:]]

# Mutation
def mutation():
    """Flip one random gene of each chromosome with probability ``pm``.

    For each of the first ``pop_size`` chromosomes, a draw below ``pm``
    triggers a mutation: a position is drawn with
    ``random.randint(0, chrom_length - 1)`` and that gene is changed in place
    (1 becomes 0, anything else becomes 1).
    """
    for idx in range(pop_size):
        if random.random() >= pm:
            continue  # no mutation for this chromosome
        chromosome = pop[idx]
        locus = random.randint(0, chrom_length - 1)  # random mutation point
        chromosome[locus] = 0 if chromosome[locus] == 1 else 1

# Best solution
def getBest():
    """Return ``(best_chrom, best_fitness)`` among the first ``pop_size`` chromosomes.

    The chromosome with the highest value in ``fitness_list`` wins; on a tie
    the one with the lowest index is returned. ``best_chrom`` is a copy of the
    population entry, so a caller that stores it keeps the recorded genes even
    if the population is later modified in place.
    """
    best_index = max(range(pop_size), key=lambda idx: fitness_list[idx])
    return list(pop[best_index]), fitness_list[best_index]

if __name__ == '__main__':
    # Run the genetic algorithm and plot the best fitness of each generation live.
    px = []       # generation indices plotted so far
    py = []       # best fitness of each generation
    results = []  # one [generation, best chromosome, best fitness] entry per generation

    # Create the figure once and update a single line afterwards; calling
    # plt.plot() every generation would stack a new line on the axes each time.
    plt.ion()
    curve, = plt.plot(px, py)
    plt.title('GA feature selection')
    plt.xlabel('iterations')
    plt.ylabel('best fitness')
    plt.xlim((0, iterations))  # x-axis range
    plt.ylim((0.6, 0.8))  # y-axis range

    geneEncoding()  # initialize the population
    for i in range(iterations):
        print(i)

        calFitness()  # fitness of every chromosome in the population

        best_chrom, best_fitness = getBest()
        results.append([i, best_chrom, best_fitness])

        selection()  # selection
        crossover()  # crossover
        mutation()   # mutation

        print([i, best_chrom, best_fitness])

        px.append(i)
        py.append(best_fitness)
        curve.set_data(px, py)
        plt.pause(0.001)  # let the GUI redraw the updated curve

以上代码利用遗传算法做特征选择(以决策树 5 折交叉验证的平均得分作为适应度),并实时绘制每一代的最佳适应度曲线。

下面的代码利用 t-SNE 算法对遗传算法选出的特征进行降维,并将结果可视化:

import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Load the dataset (sonar.all-data is used as the example; make sure the path is correct).
data = pd.read_csv('./dataset/sonar.all-data', header=None, sep=',')
# Fixed 0/1 mask: the result of the genetic-algorithm feature selection.
arr = [0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1]
# Positions of every mask entry equal to 1.
indices_of_ones = [pos for pos in range(len(arr)) if arr[pos] == 1]
# Show the selected positions.
print(indices_of_ones)
# The last column is taken as the label; the selected columns are the features.
y = data.iloc[:, -1]  # labels (no flatten needed: y is already a Series)
X = data.iloc[:, indices_of_ones]  # feature data

# Build the t-SNE model.
# perplexity, max_iter and the other parameters can be tuned to improve the result.
# Note: newer scikit-learn versions deprecate n_iter in favour of max_iter, so max_iter is used.
# NOTE(review): three components are computed but only the first two are plotted below,
# and the imported Axes3D is unused here — presumably a 3D plot was intended; confirm.
tsne = TSNE(n_components=3, perplexity=30, max_iter=300, random_state=42)

# Run the t-SNE dimensionality reduction.
X_tsne = tsne.fit_transform(X)

# Visualise the result as one scatter series per class.
plt.figure(figsize=(8, 6))
label_to_color = {'R': 'red', 'M': 'blue'}
for label in y.unique():
    # Keep only the embedded points that belong to the current class.
    points = X_tsne[y == label]
    first_axis = points[:, 0]
    second_axis = points[:, 1]
    plt.scatter(first_axis, second_axis, color=label_to_color[label], label=label)

plt.title('t-SNE Visualization of Sonar Dataset')
plt.xlabel('t-SNE Feature 1')
plt.ylabel('t-SNE Feature 2')
plt.legend()
plt.show()

下面是绘制混淆矩阵的代码:


from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# # Suppose these are your model's predictions
# y_pred = [2, 0, 2, 2, 0, 1, 1, 2]
# # Suppose these are the ground-truth labels
# y_true = [2, 0, 2, 2, 0, 1, 0, 2]
# NOTE(review): y_true and y_pred are not defined in this snippet; they
# presumably come from your own model evaluation — define them before running.

# Compute the confusion matrix
cm = confusion_matrix(y_true, y_pred)
print(cm)

# Draw the confusion matrix as an annotated heat map
classes = ['other', 'frogman', 'UUV']
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=classes,
    yticklabels=classes,
    annot_kws={"size": 16},
)
plt.title('Confusion Matrix')
# Alternative title, kept disabled:
# plt.title('轨迹特征混淆矩阵')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()