机器学习-K-近邻算法

hxl1988_0311

浏览: 43713 次

最近访客更多访客>>

gaoshaoye

icnd

xiyudalolang

jianwang0310

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

数据

K-近邻算法

K-近邻算法（KNN）

概念：采用测量不同特征值之间距离的方法进行分类——找出与待分类样本距离最近的K个邻居，并以这K个邻居中出现次数最多的类别作为分类结果

1.K-近邻算法的流程

收集数据

准备数据

分析数据

训练算法

测试算法

使用算法

2.实际使用

K值的选择：如果选择较小的K值，整体模型变得复杂，容易发生过拟合；而K值增大则意味着整体模型变得简单，K值过大时容易欠拟合

下面使用java实现KNN

从文本中获取数据：

10,20-A
12,18-A
8,26-A
1,1-B
0,1-B
11,22-A
7,20-A
40,20-C
35,18-C
50,26-C
2,2-B
45,19-C
3,1-B
3,4-B
38,22-C

package com.sosop.knn;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Loads the labelled training points used by the KNN example.
 *
 * <p>Every non-blank line of the data file has the form {@code x,y-LABEL},
 * for example {@code 10,20-A}.
 */
public class Data {
	/** File read by {@link #getData()}. */
	private static final String DEFAULT_PATH = "/home/sosop/mytest/KNN-data";

	/**
	 * Reads the training points from the default data file.
	 *
	 * @return map from label to the list of {@code {x, y}} coordinates that
	 *         carry this label; never {@code null}
	 */
	public static Map<String, List<int[]>> getData() {
		return getData(DEFAULT_PATH);
	}

	/**
	 * Reads training points from the given file.
	 *
	 * <p>Blank lines are skipped. If the file cannot be read, the stack trace
	 * is printed and whatever was parsed so far (possibly nothing) is returned.
	 *
	 * @param path location of the data file
	 * @return map from label to the list of {@code {x, y}} coordinates that
	 *         carry this label; never {@code null}
	 * @throws IllegalArgumentException if a non-blank line is not of the form
	 *         {@code x,y-LABEL} (this includes {@link NumberFormatException}
	 *         for non-integer coordinates)
	 */
	public static Map<String, List<int[]>> getData(String path) {
		Map<String, List<int[]>> map = new HashMap<String, List<int[]>>();
		try (BufferedReader reader = new BufferedReader(
				new InputStreamReader(new FileInputStream(path)))) {
			String line;
			while ((line = reader.readLine()) != null) {
				// A trailing newline or spacer line must not abort the whole load.
				if (line.trim().isEmpty()) {
					continue;
				}
				String[] data = line.split("-");
				if (data.length < 2) {
					throw new IllegalArgumentException("Missing label in line: " + line);
				}
				String[] strCoot = data[0].split(",");
				if (strCoot.length < 2) {
					throw new IllegalArgumentException("Missing coordinate in line: " + line);
				}
				// x comes from the first field, y from the second one.
				int[] coordinate = {
						Integer.parseInt(strCoot[0].trim()),
						Integer.parseInt(strCoot[1].trim()) };
				String label = data[1].trim();
				List<int[]> list = map.get(label);
				if (list == null) {
					list = new ArrayList<int[]>();
					map.put(label, list);
				}
				list.add(coordinate);
			}
		} catch (IOException e) {
			// Best effort: report the problem and return what has been read.
			e.printStackTrace();
		}
		return map;
	}
}

package com.sosop.knn;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;

public class KNN {
	// 计算距离并排序
	public static Map<Double, List<String>> distanceSort(int[] point) {
		Map<Double, List<String>> sortMap = new TreeMap<Double, List<String>>();
		for (Entry<String, List<int[]>> entry : Data.getData().entrySet()) {
			List<int[]> list = entry.getValue();
			for (int[] data : list) {
				double d = Math.sqrt(Math.pow((data[0] - point[0]), 2)
						+ Math.pow((data[1] - point[1]), 2));
				if (sortMap.get(d) == null) {
					List<String> arrList = new ArrayList<String>();
					arrList.add(entry.getKey());
					sortMap.put(d, arrList);
				} else {
					sortMap.get(d).add(entry.getKey());
				}
			}
		}
		return sortMap;
	}

	// 找出K个距离最近的点，分析类别最多的，最后分类
	public static String classify(int k, int point[]) {
		String category = null;
		String[] labels = new String[k];
		int index = 0;
		for (Entry<Double, List<String>> entry : distanceSort(point).entrySet()) {
			for (String label : entry.getValue()) {
				labels[index] = label;
				index++;
				if (index == k)
					break;
			}
			if (index == k)
				break;
		}

		Map<String, Integer> statistic = new HashMap<String, Integer>();

		//int max = 0;
		// 找出类别
		for (String label : labels) {
			if (statistic.get(label) == null) {
				statistic.put(label, 1);
			} else {
				statistic.put(label, statistic.get(label) + 1);
			}
			//if(max < statistic.get(label)) max = statistic.get(label);
		}

		Map<String, Integer> sorted = new TreeMap<String, Integer>(
				new ValueComparator<String, Integer>(statistic));
		sorted.putAll(statistic);
		for (String label : sorted.keySet()) {
			category = label;
			break;
		}
		return category;
	}
	
	
	 public static void main(String[] args) { 
		 int[] point = {33,20};
		 System.out.println(classify(3, point));
	 }
	 

}

/**
 * Orders keys by the value they map to in a backing map, largest value first
 * (descending order by value).
 *
 * <p>Two keys whose values are numerically equal compare as equal. A sorted
 * collection built with this comparator (such as a {@code TreeMap}) therefore
 * keeps only one key per distinct value. Every compared key must be present
 * in the backing map with a non-null value, otherwise {@link #compare}
 * throws {@link NullPointerException}.
 *
 * @param <T> key type
 * @param <E> numeric value type; values are compared via {@code doubleValue()}
 */
class ValueComparator<T, E extends Number> implements Comparator<T> {

	/** Backing map that supplies the value of each compared key. */
	Map<T, E> map;

	/**
	 * @param map backing map; it is referenced, not copied, so later changes
	 *            to it affect the ordering
	 */
	public ValueComparator(Map<T, E> map) {
		this.map = map;
	}

	/**
	 * Compares two keys by their mapped values in descending order.
	 *
	 * @return a negative number if {@code o1} has the larger value, a positive
	 *         number if {@code o2} has the larger value, {@code 0} if equal
	 */
	@Override
	public int compare(T o1, T o2) {
		// Arguments are swapped on purpose: a larger value must sort first.
		return Double.compare(map.get(o2).doubleValue(), map.get(o1).doubleValue());
	}

}

分享到：

MongoDB（一）安装 | 统计学（四）

2014-05-01 20:05
浏览 507
评论(0)
分类:行业应用
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

机器学习-K-近邻算法

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

机器学习-K-近邻算法

评论

发表评论

相关推荐

关联规则

mysql sharding技术理解

推荐引擎

近似度计算－欧几里得和皮尔逊

机器学习-决策树

Hadoop MapReduce程序开发（二）

统计学（四）

Hadoop分布式配置

统计学（三）

Hadoop MapReduce程序开发（一）

Hadoop笔记 HDFS（一）

统计学（二）

统计学（一）

最近访客更多访客>>