概述
概述:
从所周知,当需求中数据出现分布不均的情况时,按照hadoop mr任务的默认partition方法,会出现某些机子负载过重的情况,这样会拖慢整个任务进度,有时候甚至会宕机。在这里我介绍一个用随机数解决这个问题的方案,下面是通过代码来模拟map根据partition来分区的情况;
原理(追加)
方案就是运用了数学中均匀分布的原理,举个例子:10个小球,有5个盒子,那么每个小球分配到每个盒子的概率都是相同的,即五分之一,那么例子中小球模拟的是mr中的key,盒子模拟的是reducer。这种方案相对于官方提供的用一致性哈希算法有什么优点呢,我们了解到要使用一致性哈希算法达到均匀分布key,哈希算法中的虚拟桶数在足够多,但是hadoop处理的数据都是十亿级数据,key种类也是千万级以上,如果为了更均匀,必然会耗费更多的内存,但是如果采用这种方案,那么几乎是不需要内存来保存key的状态,而且数据量越大,分布就更加均匀
测试代码
package com.mxq.balance;
import java.util.Vector;
public class UNBalanceData {
private Vector<keyAndNum> keyVector=new Vector<keyAndNum>(10);
//这里用于生成不均匀数据样本的信息;
public void intialize(){
keyAndNum kn=new keyAndNum(1,50);
this.keyVector.add(kn);
kn=new keyAndNum(2,100);
this.keyVector.add(kn);
kn=new keyAndNum(3,80);
this.keyVector.add(kn);
//其中keyAndNum(4,1000020)表示key 4 有1000020个记录;
kn=new keyAndNum(4,1000020);
this.keyVector.add(kn);
kn=new keyAndNum(5,10050);
this.keyVector.add(kn);
kn=new keyAndNum(6,200);
this.keyVector.add(kn);
kn=new keyAndNum(7,100050);
this.keyVector.add(kn);
kn=new keyAndNum(8,250);
this.keyVector.add(kn);
kn=new keyAndNum(9,140);
this.keyVector.add(kn);
kn=new keyAndNum(10,90);
this.keyVector.add(kn);
}
//获取总样本数是;
public int getLoop(){
int loop=0;
for(int i=0;i<this.keyVector.size();i++){
loop+=this.keyVector.get(i).getKeyNum();
}
return loop;
}
//根据initialize生成的信息 生成样本时用;
public boolean minus(int key){
for(int i=0;i<this.keyVector.size();i++){
if(this.keyVector.get(i).getKey()==key){
if(this.keyVector.get(i).getKeyNum()<1){
//System.out.println("key :" +key+"'s num "+this.keyVector.get(i).getKeyNum()+ " ,please check program");
return false;
}else{
this.keyVector.get(i).minus();
return true;
}
}
}
return false;
}
}
//此类是模拟reduce
package com.mxq.balance;
import java.util.Vector;
public class reduceContain {
public Vector<keyAndNum> reduceVec=new Vector<keyAndNum>();
//根本key来收集各个key在某个reducer的数量;
public void addKey(int key){
boolean find=false;
int index=-1;
for(int i=0;i<this.reduceVec.size();i++){
if(this.reduceVec.get(i).getKey()==key)
{
find=true;
index=i;
break;
}
}
if(find){
this.reduceVec.get(index).add();
}else
{
keyAndNum kn=new keyAndNum(key,1);
this.reduceVec.add(kn);
}
}
//获取最终key在某个reducer的样本数
public int getResult(int key){
for(int i=0;i<this.reduceVec.size();i++){
if(this.reduceVec.get(i).getKey()==key){
return this.reduceVec.get(i).getKeyNum();
}
}
System.out.println("can not find key");
return 0;
}
}
//保存key的状态或者配置信息;
package com.mxq.balance;
public class keyAndNum {
int key;
int keyNum;
public keyAndNum(int key,int num){
this.key=key;
this.keyNum=num;
}
public int getKey() {
return key;
}
public void setKey(int key) {
this.key = key;
}
public int getKeyNum() {
return keyNum;
}
public void setKeyNum(int keyNum) {
this.keyNum = keyNum;
}
public void add(){
this.keyNum++;
}
public void minus(){
this.keyNum--;
}
}
package com.mxq.balance;
import java.util.Random;
import java.util.Vector;
public class testPartition {
//配置reducer数量
public int reducerNum=5;
//保存生成的样本数据;
public Vector<Integer> randomData=new Vector<Integer>();
//保存reducer容器的相关状态;
public Vector<keyToReducer> krVec=new Vector<keyToReducer>(reducerNum);
public static void main(String[] args) {
// TODO Auto-generated method stub
Random random=new Random();
int loop=0;
UNBalanceData unbd=new UNBalanceData();
unbd.intialize();
loop=unbd.getLoop();
testPartition tp=new testPartition();
tp.initSampleVec(loop);
tp.initReducer();
int i=0;
while(i<loop){
//System.out.println(random.nextInt(10));
int key=random.nextInt(10)+1;
if(unbd.minus(key))
{
tp.addData(key);
i++;
}
}
for(int h=0;h<tp.randomData.size();h++){
int rid=random.nextInt(tp.reducerNum);
int key=tp.randomData.get(h);
tp.addKeyToReducer(rid, key);
}
tp.statistic();
}
public void initSampleVec(int loop){
this.randomData=new Vector<Integer>(loop);
}
//将key添加样本容器当中;
public void addData(int key){
this.randomData.add(new Integer(key));
}
public void initReducer(){
for(int i=0;i<reducerNum;i++){
keyToReducer kr=new keyToReducer(i);
this.krVec.add(kr);
}
}
//根据partitionid 和Key添加到不同的reducer中
public void addKeyToReducer(int rid,int key){
for(int i=0;i<this.krVec.size();i++){
if(this.krVec.get(i).getRid()==rid){
this.krVec.get(i).getRc().addKey(key);
break;
}
}
}
//统计最终结果;
public void statistic(){
int sum=0;
for(int i=0;i<10;i++){
sum=0;
for(int h=0;h<this.krVec.size();h++){
sum+=this.krVec.get(h).getRc().getResult(i+1);
}
System.out.println("key: "+(i+1)+" has "+sum+" object");
for(int h=0;h<this.krVec.size();h++){
int num=this.krVec.get(h).getRc().getResult(i+1);
System.out.println("in reducer id:"+h+" has key:"+(i+1)+" by "+(float)num/(float)sum);
}
System.out.println();
System.out.println();
}
}
//为每个reducer 配一个ID;
public class keyToReducer{
private int rid;
private reduceContain rc;
public keyToReducer(int id){
this.rid=id;
this.rc=new reduceContain();
}
public int getRid() {
return rid;
}
public void setRid(int rid) {
this.rid = rid;
}
public reduceContain getRc() {
return rc;
}
public void setRc(reduceContain rc) {
this.rc = rc;
}
}
}
测试结果;
其中key的种类固定只有10种;
reducer的数量可配;
reducer数量:5
//key样本信息
keyAndNum kn=new keyAndNum(1,50);
this.keyVector.add(kn);
kn=new keyAndNum(2,100);
this.keyVector.add(kn);
kn=new keyAndNum(3,80);
this.keyVector.add(kn);
kn=new keyAndNum(4,1000020);
this.keyVector.add(kn);
kn=new keyAndNum(5,10050);
this.keyVector.add(kn);
kn=new keyAndNum(6,200);
this.keyVector.add(kn);
kn=new keyAndNum(7,100050);
this.keyVector.add(kn);
kn=new keyAndNum(8,250);
this.keyVector.add(kn);
kn=new keyAndNum(9,140);
this.keyVector.add(kn);
kn=new keyAndNum(10,90);
this.keyVector.add(kn);
//测试结果
key: 1 has 50 object
in reducer id:0 has key:1 by 0.18
in reducer id:1 has key:1 by 0.2
in reducer id:2 has key:1 by 0.2
in reducer id:3 has key:1 by 0.24
in reducer id:4 has key:1 by 0.18
key: 2 has 100 object
in reducer id:0 has key:2 by 0.19
in reducer id:1 has key:2 by 0.18
in reducer id:2 has key:2 by 0.2
in reducer id:3 has key:2 by 0.25
in reducer id:4 has key:2 by 0.18
key: 3 has 80 object
in reducer id:0 has key:3 by 0.2375
in reducer id:1 has key:3 by 0.225
in reducer id:2 has key:3 by 0.2
in reducer id:3 has key:3 by 0.2375
in reducer id:4 has key:3 by 0.1
key: 4 has 1000020 object
in reducer id:0 has key:4 by 0.19913201
in reducer id:1 has key:4 by 0.200015
in reducer id:2 has key:4 by 0.199976
in reducer id:3 has key:4 by 0.19990501
in reducer id:4 has key:4 by 0.20097198
key: 5 has 10050 object
in reducer id:0 has key:5 by 0.19641791
in reducer id:1 has key:5 by 0.20199005
in reducer id:2 has key:5 by 0.1999005
in reducer id:3 has key:5 by 0.20109452
in reducer id:4 has key:5 by 0.20059702
key: 6 has 200 object
in reducer id:0 has key:6 by 0.17
in reducer id:1 has key:6 by 0.205
in reducer id:2 has key:6 by 0.21
in reducer id:3 has key:6 by 0.19
in reducer id:4 has key:6 by 0.225
key: 7 has 100050 object
in reducer id:0 has key:7 by 0.2012194
in reducer id:1 has key:7 by 0.19902049
in reducer id:2 has key:7 by 0.20127936
in reducer id:3 has key:7 by 0.19898051
in reducer id:4 has key:7 by 0.19950025
key: 8 has 250 object
in reducer id:0 has key:8 by 0.18
in reducer id:1 has key:8 by 0.208
in reducer id:2 has key:8 by 0.212
in reducer id:3 has key:8 by 0.192
in reducer id:4 has key:8 by 0.208
key: 9 has 140 object
in reducer id:0 has key:9 by 0.20714286
in reducer id:1 has key:9 by 0.23571429
in reducer id:2 has key:9 by 0.19285715
in reducer id:3 has key:9 by 0.16428572
in reducer id:4 has key:9 by 0.2
key: 10 has 90 object
in reducer id:0 has key:10 by 0.2
in reducer id:1 has key:10 by 0.17777778
in reducer id:2 has key:10 by 0.22222222
in reducer id:3 has key:10 by 0.21111111
in reducer id:4 has key:10 by 0.18888889
reducer数量:10
key: 1 has 50 object
in reducer id:0 has key:1 by 0.1
in reducer id:1 has key:1 by 0.1
in reducer id:2 has key:1 by 0.14
in reducer id:3 has key:1 by 0.06
in reducer id:4 has key:1 by 0.16
in reducer id:5 has key:1 by 0.06
in reducer id:6 has key:1 by 0.02
in reducer id:7 has key:1 by 0.12
in reducer id:8 has key:1 by 0.12
in reducer id:9 has key:1 by 0.12
key: 2 has 100 object
in reducer id:0 has key:2 by 0.1
in reducer id:1 has key:2 by 0.08
in reducer id:2 has key:2 by 0.11
in reducer id:3 has key:2 by 0.11
in reducer id:4 has key:2 by 0.1
in reducer id:5 has key:2 by 0.12
in reducer id:6 has key:2 by 0.13
in reducer id:7 has key:2 by 0.07
in reducer id:8 has key:2 by 0.1
in reducer id:9 has key:2 by 0.08
key: 3 has 80 object
in reducer id:0 has key:3 by 0.0375
in reducer id:1 has key:3 by 0.0875
in reducer id:2 has key:3 by 0.075
in reducer id:3 has key:3 by 0.125
in reducer id:4 has key:3 by 0.0625
in reducer id:5 has key:3 by 0.1625
in reducer id:6 has key:3 by 0.1125
in reducer id:7 has key:3 by 0.1
in reducer id:8 has key:3 by 0.1375
in reducer id:9 has key:3 by 0.1
key: 4 has 1000020 object
in reducer id:0 has key:4 by 0.099913
in reducer id:1 has key:4 by 0.100039
in reducer id:2 has key:4 by 0.09953101
in reducer id:3 has key:4 by 0.099645
in reducer id:4 has key:4 by 0.100186
in reducer id:5 has key:4 by 0.099819005
in reducer id:6 has key:4 by 0.100185
in reducer id:7 has key:4 by 0.100311995
in reducer id:8 has key:4 by 0.100187995
in reducer id:9 has key:4 by 0.100182
key: 5 has 10050 object
in reducer id:0 has key:5 by 0.100497514
in reducer id:1 has key:5 by 0.10179105
in reducer id:2 has key:5 by 0.10597015
in reducer id:3 has key:5 by 0.09960199
in reducer id:4 has key:5 by 0.094228856
in reducer id:5 has key:5 by 0.10079602
in reducer id:6 has key:5 by 0.10258707
in reducer id:7 has key:5 by 0.09860697
in reducer id:8 has key:5 by 0.09542289
in reducer id:9 has key:5 by 0.100497514
key: 6 has 200 object
in reducer id:0 has key:6 by 0.11
in reducer id:1 has key:6 by 0.095
in reducer id:2 has key:6 by 0.08
in reducer id:3 has key:6 by 0.105
in reducer id:4 has key:6 by 0.1
in reducer id:5 has key:6 by 0.09
in reducer id:6 has key:6 by 0.1
in reducer id:7 has key:6 by 0.105
in reducer id:8 has key:6 by 0.125
in reducer id:9 has key:6 by 0.09
key: 7 has 100050 object
in reducer id:0 has key:7 by 0.10067966
in reducer id:1 has key:7 by 0.10117941
in reducer id:2 has key:7 by 0.099550225
in reducer id:3 has key:7 by 0.097891055
in reducer id:4 has key:7 by 0.10011994
in reducer id:5 has key:7 by 0.10177911
in reducer id:6 has key:7 by 0.10029985
in reducer id:7 has key:7 by 0.09991004
in reducer id:8 has key:7 by 0.09992004
in reducer id:9 has key:7 by 0.09867066
key: 8 has 250 object
in reducer id:0 has key:8 by 0.108
in reducer id:1 has key:8 by 0.06
in reducer id:2 has key:8 by 0.112
in reducer id:3 has key:8 by 0.12
in reducer id:4 has key:8 by 0.108
in reducer id:5 has key:8 by 0.092
in reducer id:6 has key:8 by 0.084
in reducer id:7 has key:8 by 0.132
in reducer id:8 has key:8 by 0.112
in reducer id:9 has key:8 by 0.072
key: 9 has 140 object
in reducer id:0 has key:9 by 0.07857143
in reducer id:1 has key:9 by 0.1
in reducer id:2 has key:9 by 0.10714286
in reducer id:3 has key:9 by 0.14285715
in reducer id:4 has key:9 by 0.08571429
in reducer id:5 has key:9 by 0.12142857
in reducer id:6 has key:9 by 0.092857145
in reducer id:7 has key:9 by 0.057142857
in reducer id:8 has key:9 by 0.08571429
in reducer id:9 has key:9 by 0.12857144
key: 10 has 90 object
in reducer id:0 has key:10 by 0.1
in reducer id:1 has key:10 by 0.08888889
in reducer id:2 has key:10 by 0.06666667
in reducer id:3 has key:10 by 0.07777778
in reducer id:4 has key:10 by 0.1
in reducer id:5 has key:10 by 0.16666667
in reducer id:6 has key:10 by 0.11111111
in reducer id:7 has key:10 by 0.13333334
in reducer id:8 has key:10 by 0.08888889
in reducer id:9 has key:10 by 0.06666667
总结
根据配置信息,key 4有1000020 个记录,是样本量最多的key,但测试结果显示key 4在各个reducer上是分布均匀的
最后
以上就是粗暴蜜粉为你收集整理的mapreduce任务中数据分布倾斜导致reduce负载不均衡的解决方案的全部内容,希望文章能够帮你解决mapreduce任务中数据分布倾斜导致reduce负载不均衡的解决方案所遇到的程序开发问题。
如果觉得靠谱客网站的内容还不错,欢迎将靠谱客网站推荐给程序员好友。
本图文内容来源于网友提供,作为学习参考使用,或来自网络收集整理,版权属于原作者所有。
发表评论 取消回复