Overview
For data with a given field delimiter, sort the records by any chosen column, here in descending order.
Original data:
- hadoop@sh-hadoop:more sourText.txt
- hadoop|234|2346|sdfasdgadfgdfg
- spark|534|65745|fhsdfghdfgh
- hive|65|6585|shsfghfgh
- hbase|98|456|jhgjdfghj
- tachyon|345|567|sfhrtyhert
- kafka|455|567|dghrtyh
- storm|86|345|dgsdfg
- redis|45|56|ergerg
- sqoop|45|765|fghd
- flume|34|67|sdfgrty
- oozie|23|45|adfgdfg
- pig|54|456|dfg
- zookeeper|23|543|dfgd
- solr|75|54|ertgergt
1. Sort with MapReduce, descending by the 2nd column (full implementation in the appendix):
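A submit command along these lines would run the job (the jar name domain-merge.jar is only a placeholder; the input and output paths match the checks that follow). The wc -l counts then confirm that no lines were lost:
hadoop jar domain-merge.jar Domain_merge /user/hadoop/libin/input/sourText.txt /user/hadoop/libin/Domain800_level2/merge1/out1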
- hadoop@sh-hadoop:/home/hadoop/blb$ hdfs dfs -text /user/hadoop/libin/input/sourText.txt | wc -l
- 14
- hadoop@sh-hadoop:/home/hadoop/blb$ hdfs dfs -text /user/hadoop/libin/Domain800_level2/merge1/out1/* | wc -l
- 14
- hadoop@sh-hadoop:/home/hadoop/blb$ hdfs dfs -text /user/hadoop/libin/Domain800_level2/merge1/out1/* | more
- spark|534|65745|fhsdfghdfgh
- kafka|455|567|dghrtyh
- tachyon|345|567|sfhrtyhert
- hadoop|234|2346|sdfasdgadfgdfg
- hbase|98|456|jhgjdfghj
- storm|86|345|dgsdfg
- solr|75|54|ertgergt
- hive|65|6585|shsfghfgh
- pig|54|456|dfg
- redis|45|56|ergerg
- sqoop|45|765|fghd
- flume|34|67|sdfgrty
- oozie|23|45|adfgdfg
- zookeeper|23|543|dfgd
- hadoop@sh-hadoop:/home/hadoop/blb$
2. Sort with shell commands. The relevant sort options:
-r: sort orders ascending by default; add -r to get descending order.
-n: tells sort to compare numerically rather than as strings.
-t: sets the field separator (here the pipe character).
-k: once the separator is set, -k selects the column (field) to sort on.
2.1 Sort descending by the 2nd column:
sort -t "|" -nrk2 sourText.txt
- hadoop@sh-hadoop:/home/hadoop/blb$ sort -t "|" -nrk2 sourText.txt
- spark|534|65745|fhsdfghdfgh
- kafka|455|567|dghrtyh
- tachyon|345|567|sfhrtyhert
- hadoop|234|2346|sdfasdgadfgdfg
- hbase|98|456|jhgjdfghj
- storm|86|345|dgsdfg
- solr|75|54|ertgergt
- hive|65|6585|shsfghfgh
- pig|54|456|dfg
- sqoop|45|765|fghd
- redis|45|56|ergerg
- flume|34|67|sdfgrty
- zookeeper|23|543|dfgd
- oozie|23|45|adfgdfg
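A note on the key syntax: -k2 starts the sort key at field 2 but lets it run to the end of the line. With -n this rarely matters, since numeric parsing stops at the first "|", but the precise form gives the key an end field as well (GNU sort syntax):
sort -t "|" -k2,2nr sourText.txt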
2.2 Sort descending by the 3rd column:
- hadoop@sh-hadoop:/home/hadoop/blb$ sort -t "|" -nrk3 sourText.txt
- spark|534|65745|fhsdfghdfgh
- hive|65|6585|shsfghfgh
- hadoop|234|2346|sdfasdgadfgdfg
- sqoop|45|765|fghd
- tachyon|345|567|sfhrtyhert
- kafka|455|567|dghrtyh
- zookeeper|23|543|dfgd
- pig|54|456|dfg
- hbase|98|456|jhgjdfghj
- storm|86|345|dgsdfg
- flume|34|67|sdfgrty
- redis|45|56|ergerg
- solr|75|54|ertgergt
- oozie|23|45|adfgdfg
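In the output above, ties on column 3 (kafka and tachyon both have 567) are broken by GNU sort's last-resort comparison of the whole line, reversed here because of -r. To break ties by another column instead, add an explicit secondary key, e.g. column 2 descending:
sort -t "|" -k3,3nr -k2,2nr sourText.txt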
Redirect the sorted result into a new file:
sort -t "|" -nrk2 part-r-00000 > merge.txt
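Equivalently, sort can write the file itself via -o, which unlike a shell redirect also permits the output file to be the same as the input file:
sort -t "|" -nrk2 -o merge.txt part-r-00000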
Appendix:
MapReduce implementation:
- import java.io.DataInput;
- import java.io.DataOutput;
- import java.io.IOException;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.io.LongWritable;
- import org.apache.hadoop.io.NullWritable;
- import org.apache.hadoop.io.Text;
- import org.apache.hadoop.io.WritableComparable;
- import org.apache.hadoop.mapreduce.Job;
- import org.apache.hadoop.mapreduce.Mapper;
- import org.apache.hadoop.mapreduce.Reducer;
- import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
- import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
- import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
- import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
- import org.apache.hadoop.util.GenericOptionsParser;
- public class Domain_merge {
- public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
- Configuration conf = new Configuration();
- String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
- if (otherArgs.length != 2) {
- System.err.println("Usage: Domain_merge <input> <output>");
- System.exit(2);
- }
- Job job4 = Job.getInstance(conf, Domain_merge.class.getSimpleName());
- job4.setJarByClass(Domain_merge.class);
- job4.setMapOutputKeyClass(Toptaobao500.class);
- job4.setMapOutputValueClass(Text.class);
- job4.setOutputKeyClass(Text.class);
- job4.setOutputValueClass(NullWritable.class);
- job4.setMapperClass(MyMapper2.class);
- job4.setNumReduceTasks(1); // a single reducer yields one globally sorted output file
- job4.setReducerClass(MyReducer2.class);
- job4.setInputFormatClass(TextInputFormat.class);
- job4.setOutputFormatClass(TextOutputFormat.class);
- FileInputFormat.addInputPath(job4, new Path(otherArgs[0]));
- FileOutputFormat.setOutputPath(job4, new Path(otherArgs[1]));
- job4.waitForCompletion(true);
- }
- /**
- * Mapper for the sorting job (originally the second job in a pipeline):
- * wraps each line in a composite key whose compareTo orders by count descending.
- */
- public static class MyMapper2 extends Mapper<LongWritable, Text, Toptaobao500, Text>{
- Toptaobao500 mw = new Toptaobao500(); // reused across map() calls; safe because context.write serializes immediately
- @Override
- protected void map(LongWritable key, Text value,
- Mapper<LongWritable, Text, Toptaobao500, Text>.Context context)
- throws IOException, InterruptedException {
- // "|" is a regex metacharacter, so it must be escaped in split()
- String[] spl = value.toString().split("\\|");
- String trait = spl[0].trim();
- String uv = spl[1].trim();
- String pv = spl[2].trim();
- String fenlei = spl[3].trim();
- mw.setkind(trait + "|" + uv + "|" + pv + "|" + fenlei);
- // sort key: the 2nd column (uv), compared numerically
- mw.setCount(Long.parseLong(uv));
- context.write(mw, new Text(value));
- }
- }
- public static class MyReducer2 extends Reducer<Toptaobao500, Text, Text, NullWritable>{
- @Override
- protected void reduce(Toptaobao500 k4, Iterable<Text> v4s, Reducer<Toptaobao500, Text, Text, NullWritable>.Context context)
- throws IOException, InterruptedException {
- // Keys compare equal whenever their counts are equal, so a single
- // reduce call may receive several distinct lines; emit every one.
- for (Text v4 : v4s) {
- context.write(v4, NullWritable.get());
- }
- }
- }
- public static class Toptaobao500 implements WritableComparable<Toptaobao500> {
- String kind;
- Long count;
- public Toptaobao500() {
- }
- public Toptaobao500(String kind, Long count) {
- this.kind = kind;
- this.count = count;
- }
- public void setkind(String kind) {
- this.kind = kind;
- }
- public void setCount(Long l) {
- this.count = l;
- }
- public String getKind() {
- return this.kind;
- }
- public Long getCount() {
- return this.count;
- }
- @Override
- public void write(DataOutput out) throws IOException {
- out.writeUTF(kind);
- out.writeLong(count);
- }
- @Override
- public void readFields(DataInput in) throws IOException {
- this.kind = in.readUTF();
- this.count = in.readLong();
- }
- @Override
- public int compareTo(Toptaobao500 o) {
- // Descending order: records with a larger count sort first.
- return o.count.compareTo(this.count);
- }
- // Identity-based equals/hashCode suffice here: grouping in the shuffle
- // is driven by compareTo, and with one reducer the hash partitioner
- // is irrelevant.
- @Override
- public boolean equals(Object obj) {
- return super.equals(obj);
- }
- @Override
- public int hashCode() {
- return super.hashCode();
- }
- @Override
- public String toString() {
- return this.kind;
- }
- }
- }