Hadoop基础-HDFS递归列出文件系统-listStatus与listFiles两种方法

66 阅读 0 评论 44 点赞

我是靠谱客的博主怕孤单天空，这篇文章主要介绍Hadoop基础-HDFS递归列出文件系统-listStatus与listFiles两种方法，现在分享给大家，希望可以做个参考。

　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　作者：尹正杰

　　fs.listFiles方法返回LocatedFileStatus的迭代器，自带递归。LocatedFileStatus继承自FileStatus，但其构造函数是FileStatus的文件版，即listFiles只能列出文件（不包含目录）；而fs.listStatus方法返回FileStatus数组，文件和目录都会列出，但需要自行编写递归。接下来我们一起看看这两个方法的用法。

一.listStatus方法

 1 /*
 2 @author :yinzhengjie
 3 Blog:http://www.cnblogs.com/yinzhengjie/tag/Hadoop%E8%BF%9B%E9%98%B6%E4%B9%8B%E8%B7%AF/
 4 EMAIL:y1053419035@qq.com
 5 */
 6 package cn.org.yinzhengjie.day01.note1;
 7 
 8 import org.apache.hadoop.conf.Configuration;
 9 import org.apache.hadoop.fs.FSDataInputStream;
10 import org.apache.hadoop.fs.FileStatus;
11 import org.apache.hadoop.fs.FileSystem;
12 import org.apache.hadoop.fs.Path;
13 import org.apache.hadoop.io.IOUtils;
14 
15 import java.io.FileOutputStream;
16 import java.io.IOException;
17 
18 public class HdfsDemo2 {
19     public static void main(String[] args) throws IOException {
20         list();
21         System.out.println("======  我是分割线  ========");
22         tree("/shell");
23     }
24 
25     //查看指定路径的树形结构，类似于Linux的tree命令。
26     private static void tree(String srcPath) throws IOException {
27         //由于我的Hadoop完全分布式根目录对yinzhengjie以外的用户(尽管是root用户也没有写入权限哟！因为是hdfs系统，并非Linux系统！)没有写入
28         // 权限，所以需要手动指定当前用户权限。使用“HADOOP_USER_NAME”属性就可以轻松搞定！
29         System.setProperty("HADOOP_USER_NAME","yinzhengjie");
30         //实例化一个Configuration，它会自动去加载本地的core-site.xml配置文件的fs.defaultFS属性。(该文件放在项目的resources目录即可。)
31         Configuration conf = new Configuration();
32         //代码的入口点，初始化HDFS文件系统，此时我们需要把读取到的fs.defaultFS属性传给fs对象。
33         FileSystem fs = FileSystem.get(conf);
34         //这个path是指是需要在文件系统中写入的数据,里面的字符串可以写出“hdfs://s101:8020/shell”，但由于core-site.xml配置
35         // 文件中已经有“hdfs://s101:8020”字样的前缀，因此我们这里可以直接写相对路径即可
36         Path path = new Path(srcPath);
37         //通过fs的listStatus方法获取一个指定path的所有文件信息(status)，因此我们需要传入一个hdfs的路径，返回的是一个filStatus数组
38         FileStatus[] fileStatuses = fs.listStatus(path);
39         for (FileStatus fileStatus : fileStatuses) {
40             //判断当前迭代对象是否是目录
41             if (fileStatus.isDirectory()){
42                 String dirPath = fileStatus.getPath().toString();
43                 System.out.println("文件夹名:" + fileStatus.getPath());
44                 tree(dirPath);
45             }else {
46                 System.out.println("文件名:" + fileStatus.getPath());
47             }
48         }
49 
50 
51     }
52 
53     //查看指定路径下的所有文件
54     private static void list() throws IOException {
55         //由于我的Hadoop完全分布式根目录对yinzhengjie以外的用户(尽管是root用户也没有写入权限哟！因为是hdfs系统，并非Linux系统！)没有写入
56         // 权限，所以需要手动指定当前用户权限。使用“HADOOP_USER_NAME”属性就可以轻松搞定！
57         System.setProperty("HADOOP_USER_NAME","yinzhengjie");
58         //实例化一个Configuration，它会自动去加载本地的core-site.xml配置文件的fs.defaultFS属性。(该文件放在项目的resources目录即可。)
59         Configuration conf = new Configuration();
60         //代码的入口点，初始化HDFS文件系统，此时我们需要把读取到的fs.defaultFS属性传给fs对象。
61         FileSystem fs = FileSystem.get(conf);
62         //这个path是指是需要在文件系统中写入的数据,里面的字符串可以写出“hdfs://s101:8020/shell”，但由于core-site.xml配置
63         // 文件中已经有“hdfs://s101:8020”字样的前缀，因此我们这里可以直接写相对路径即可
64         Path path = new Path("/shell");
65         //通过fs的listStatus方法获取一个指定path的所有文件信息(status)，因此我们需要传入一个hdfs的路径，返回的是一个filStatus数组
66         FileStatus[] fileStatuses = fs.listStatus(path);
67         for (FileStatus fileStatus : fileStatuses) {
68             //判断当前迭代对象是否是目录
69             boolean isDir = fileStatus.isDirectory();
70             //获取当前文件的绝对路径
71             String fullPath = fileStatus.getPath().toString();
72             System.out.println("isDir:" + isDir + ",Path:" + fullPath);
73         }
74     }
75 }
76 
77 /*
78 以上代码执行结果如下：
79 isDir:true,Path:hdfs://s101:8020/shell/awk
80 isDir:true,Path:hdfs://s101:8020/shell/grep
81 isDir:true,Path:hdfs://s101:8020/shell/sed
82 isDir:false,Path:hdfs://s101:8020/shell/yinzhengjie.sh
83 ======  我是分割线  ========
84 文件夹名:hdfs://s101:8020/shell/awk
85 文件名:hdfs://s101:8020/shell/awk/keepalive.sh
86 文件名:hdfs://s101:8020/shell/awk/nginx.conf
87 文件夹名:hdfs://s101:8020/shell/grep
88 文件名:hdfs://s101:8020/shell/grep/1.txt
89 文件名:hdfs://s101:8020/shell/grep/2.txt
90 文件夹名:hdfs://s101:8020/shell/sed
91 文件名:hdfs://s101:8020/shell/sed/nagios.sh
92 文件名:hdfs://s101:8020/shell/sed/zabbix.sql
93 文件名:hdfs://s101:8020/shell/yinzhengjie.sh
94  */

二.listFiles方法

 1 /*
 2 @author :yinzhengjie
 3 Blog:http://www.cnblogs.com/yinzhengjie/tag/Hadoop%E8%BF%9B%E9%98%B6%E4%B9%8B%E8%B7%AF/
 4 EMAIL:y1053419035@qq.com
 5 */
 6 package cn.org.yinzhengjie.day01.note1;
 7 
 8 import org.apache.hadoop.conf.Configuration;
 9 import org.apache.hadoop.fs.*;
10 import org.apache.hadoop.io.IOUtils;
11 
12 import java.io.FileOutputStream;
13 import java.io.IOException;
14 
15 public class HdfsDemo3 {
16     public static void main(String[] args) throws IOException {
17         autoList("/shell");
18     }
19 
20     //定义方法下载文件到本地
21     private static void autoList(String path) throws IOException {
22         //实例化一个Configuration，它会自动去加载本地的core-site.xml配置文件的fs.defaultFS属性。(该文件放在项目的resources目录即可。)
23         Configuration conf = new Configuration();
24         //代码的入口点，初始化HDFS文件系统，此时我们需要把读取到的fs.defaultFS属性传给fs对象。
25         FileSystem fs = FileSystem.get(conf);
26         //通过fs的listFiles方法可以自动实现递归(自带递归)列出文件类型，返回的是一个远程可迭代对象,需要传入两个参数，第一个参数是服务器路径，第二个参数是否递归
27         RemoteIterator<LocatedFileStatus> iterator = fs.listFiles(new Path(path), true);
28         while (iterator.hasNext()){
29             LocatedFileStatus fileStatus = iterator.next();
30             Path fullPath = fileStatus.getPath();
31             System.out.println(fullPath);
32         }
33     }
34 }
35 
36 /*
37 以上代码执行结果如下：
38 hdfs://s101:8020/shell/awk/keepalive.sh
39 hdfs://s101:8020/shell/awk/nginx.conf
40 hdfs://s101:8020/shell/grep/1.txt
41 hdfs://s101:8020/shell/grep/2.txt
42 hdfs://s101:8020/shell/sed/nagios.sh
43 hdfs://s101:8020/shell/sed/zabbix.sql
44 hdfs://s101:8020/shell/yinzhengjie.sh
45  */