当下载的日志文件(文本文件)有几十 MB 大小时,直接用文本编辑器(如 Notepad++)打开会导致编辑器卡死。于是写了一个按字节数均分的文本分割工具 TXTSpliterEqualBytes.java,将文本文件分割成 10 份(例如原文件 50 MB,分割后每个子文件约 5 MB)。
但执行 TXTSpliterEqualBytes 时可能会遇到一个问题:从第 N 份子文件开始全部是乱码。原因是按字节均分时,分割点恰好落在某个占用多个字节的字符中间,把这个字符切成了两半。于是又写了一个按字符数均分的文本分割工具 TXTSpliterEqualChars.java(例如原文件有 1 千万个字符,分割后每个子文件有 1 百万个字符)。
下载地址:https://download.csdn.net/download/shushanke/86923522
--------------------------------分割线--------------------------------
TXTSpliterEqualBytes
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.CharacterCodingException;
import java.text.DecimalFormat;
/*
javac -d . -encoding UTF-8 TXTSpliterEqualBytes.java
java TXTSpliterEqualBytes
文本切割器(按字节数均分,分割后的文件可能出现乱码:例如某个字符占用多个字节,而分割点恰好落在该字符中间,它就会被拆到两个文件中。)
*/
/**
 * Command-line tool that splits the first text file found in the current working
 * directory into N parts of (nearly) equal byte size.
 *
 * <p>Recognised suffixes are listed in {@code suffixSetOfTXTFile}. Each part is written
 * next to the original as {@code <name>_<index><suffix>}, where the index is zero-padded
 * to the width of N. All parts have {@code size / N} bytes; the last part additionally
 * receives the remainder.
 *
 * <p>Cut points are chosen purely by byte offset, so a character encoded with more than
 * one byte may be cut in half at a boundary and show up as garbage in both neighbouring
 * parts.
 *
 * <p>Build and run:
 * <pre>
 * javac -d . -encoding UTF-8 TXTSpliterEqualBytes.java
 * java TXTSpliterEqualBytes
 * </pre>
 */
public class TXTSpliterEqualBytes {
    private static final String dirPath = ".";// current directory
    private static int NUMBER_OF_FILES = 10;// number of parts to produce
    private static String absoluteDirPath = "";
    // Name (without directory) of the file to split; empty until it has been located.
    private static String originalFileName = "";
    private static DecimalFormat format;
    private static java.util.LinkedHashSet<String> suffixSetOfTXTFile = new java.util.LinkedHashSet<String>();

    static {
        suffixSetOfTXTFile.add(".log");
        suffixSetOfTXTFile.add(".LOG");
        suffixSetOfTXTFile.add(".txt");
        suffixSetOfTXTFile.add(".TXT");
        suffixSetOfTXTFile.add(".text");
        suffixSetOfTXTFile.add(".TEXT");
        format = buildFormat(NUMBER_OF_FILES);
        // The file lookup is intentionally NOT done here: an exception escaping a static
        // initializer is reported as ExceptionInInitializerError and leaves the class
        // permanently unusable. main() and split() call locateSourceFile() instead.
    }

    /**
     * Builds the formatter used for the part index in output file names.
     *
     * @param parts number of parts that will be produced
     * @return a formatter that zero-pads to as many digits as {@code parts} has
     *         (e.g. 10 gives "01".."10", 150 gives "001".."150")
     */
    static DecimalFormat buildFormat(int parts) {
        int digits = String.valueOf(Math.max(parts, 1)).length();
        StringBuilder pattern = new StringBuilder(digits);
        for (int i = 0; i < digits; i++) {
            pattern.append('0');
        }
        return new DecimalFormat(pattern.toString());
    }

    /**
     * Distributes {@code total} bytes over {@code parts} files.
     *
     * @param total total number of bytes to distribute
     * @param parts number of files, must be at least 1
     * @return an array of length {@code parts}; every entry is {@code total / parts} and
     *         the last entry additionally holds {@code total % parts}
     * @throws IllegalArgumentException if {@code parts} is less than 1
     */
    static long[] computePartSizes(long total, int parts) {
        if (parts < 1) {
            throw new IllegalArgumentException("parts must be >= 1 but was " + parts);
        }
        long[] sizes = new long[parts];
        java.util.Arrays.fill(sizes, total / parts);
        sizes[parts - 1] += total % parts;
        return sizes;
    }

    /**
     * Resolves (once) the absolute path of the working directory, always terminated by
     * the platform file separator.
     */
    private static String getabsoluteDirPath() {
        if ("".equals(absoluteDirPath)) {
            String path = new File(dirPath).getAbsolutePath();
            // For dirPath "." the absolute path ends in "<separator>."; drop only that dot.
            if (path.endsWith(File.separator + ".")) {
                path = path.substring(0, path.length() - 1);
            }
            if (!path.endsWith(File.separator)) {
                path += File.separator;
            }
            absoluteDirPath = path;
        }
        return absoluteDirPath;
    }

    /**
     * Picks the first regular file in the working directory whose suffix is in
     * {@code suffixSetOfTXTFile} and stores its name in {@code originalFileName}.
     *
     * @return the absolute directory path
     * @throws RuntimeException if the directory holds no matching file
     */
    private static String findTXTFile() {
        File[] entries = new File(getabsoluteDirPath()).listFiles();
        boolean findTXT = false;
        if (entries != null) {// listFiles() yields null when the directory cannot be read
            for (File file : entries) {
                if (!file.isFile()) {
                    continue;
                }
                String fileName = file.getName();
                int index = fileName.lastIndexOf(".");
                if (index < 1) {// no suffix, or a dot-file such as ".log"
                    continue;
                }
                if (suffixSetOfTXTFile.contains(fileName.substring(index))) {
                    originalFileName = fileName;
                    findTXT = true;
                    break;
                }
            }
        }
        if (!findTXT) {
            String tipMsg = "ERROR:请将待分割的文本文件" + suffixSetOfTXTFile.toString() + "放到当前目录下!";
            System.out.println(tipMsg);
            throw new RuntimeException(tipMsg);
        }
        return absoluteDirPath;
    }

    /** Locates the file to split unless a previous call already did so. */
    private static void locateSourceFile() {
        if ("".equals(originalFileName)) {
            findTXTFile();
        }
    }

    /**
     * Closes {@code closeable} if it is non-null; a failure is printed, never propagated.
     *
     * @param closeable resource to close, may be {@code null}
     */
    public static void closeCloseable(Closeable closeable) {
        try {
            if (closeable != null) {
                closeable.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Copies exactly {@code count} bytes starting at {@code position} of {@code source}
     * into {@code target}, repeating the transfer until everything has been written.
     *
     * @throws IOException on an I/O failure or if the source ends before {@code count}
     *         bytes could be copied
     */
    private static void copyRange(FileChannel source, long position, long count, FileChannel target)
            throws IOException {
        long copied = 0;
        while (copied < count) {
            long n = source.transferTo(position + copied, count - copied, target);
            if (n <= 0) {
                throw new IOException("Unexpected end of input after copying " + copied
                        + " of " + count + " bytes");
            }
            copied += n;
        }
    }

    /**
     * Splits the located text file into {@code NUMBER_OF_FILES} parts by byte count.
     *
     * @return {@code true} if every part was written; {@code false} if the part count is
     *         below 2 or an I/O error occurred (the error is printed)
     * @throws RuntimeException if no text file exists in the working directory
     */
    public static boolean split() {
        if (NUMBER_OF_FILES < 2) {
            System.out.println("分割后的文件个数不能小于2!");
            return false;
        }
        locateSourceFile();
        File originalFile = new File(absoluteDirPath + originalFileName);
        long[] sizeArray = computePartSizes(originalFile.length(), NUMBER_OF_FILES);
        // Recomputed on every run because the part count may have changed since class load.
        format = buildFormat(NUMBER_OF_FILES);
        int index = originalFileName.lastIndexOf(".");
        String fileName = originalFileName.substring(0, index);
        String suffix = originalFileName.substring(index);
        try (FileInputStream in = new FileInputStream(originalFile);
                FileChannel inChannel = in.getChannel()) {
            long offset = 0;
            for (int i = 0; i < NUMBER_OF_FILES; i++) {
                String newFileName = absoluteDirPath + fileName + "_" + format.format(i + 1) + suffix;
                // One try-with-resources per part so that every output file is closed.
                try (FileOutputStream out = new FileOutputStream(newFileName);
                        FileChannel outChannel = out.getChannel()) {
                    copyRange(inChannel, offset, sizeArray[i], outChannel);
                }
                offset += sizeArray[i];
            }
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
    }

    /**
     * Interactive entry point: reads lines from standard input until "exit" is entered
     * or a split with the entered part count succeeds.
     *
     * @param args ignored
     * @throws RuntimeException if no text file exists in the working directory
     */
    public static void main(String... args)throws Exception {
        locateSourceFile();// fail before prompting when there is nothing to split
        System.out.println("①输入exit并敲回车,结束程序。");
        System.out.println("②输入大于1的整数(N)并敲回车,将文本分割成N分。");
        try (BufferedReader bufReader = new BufferedReader(new InputStreamReader(System.in))) {
            String line;
            while ((line = bufReader.readLine()) != null) {
                System.out.println("本次输入的内容是:" + line);
                String input = line.trim();// tolerate stray blanks around the number
                if (input.equalsIgnoreCase("exit")) {
                    break;
                }
                int count;
                try {
                    count = Integer.parseInt(input);
                } catch (NumberFormatException e) {
                    System.out.println("请输入大于1的整数:");
                    continue;
                }
                if (count < 2) {
                    System.out.println("请输入大于1的整数:");
                    continue;
                }
                NUMBER_OF_FILES = count;
                System.out.println("文本将分割成" + NUMBER_OF_FILES + "份");
                long start = System.currentTimeMillis();
                boolean success = split();
                long end = System.currentTimeMillis();
                if (success) {
                    System.out.println("文本分割已完成,耗时(ms)=" + (end -start));
                    break;
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
--------------------------------分割线--------------------------------
TXTSpliterEqualChars
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.text.DecimalFormat;
/*
javac -d . -encoding UTF-8 TXTSpliterEqualChars.java
java TXTSpliterEqualChars
文本切割器(按字符数均分)
*/
/**
 * Command-line tool that splits the first text file found in the current working
 * directory into N parts holding (nearly) the same number of characters.
 *
 * <p>The file is decoded as UTF-8 in one go, cut by UTF-16 code-unit count and every
 * part is re-encoded as UTF-8, so no multi-byte character is ever cut in half. A cut
 * that would fall between the two halves of a surrogate pair is moved one position
 * forward so that the pair stays together. Each part is written next to the original as
 * {@code <name>_<index><suffix>}, where the index is zero-padded to the width of N.
 *
 * <p>Build and run:
 * <pre>
 * javac -d . -encoding UTF-8 TXTSpliterEqualChars.java
 * java TXTSpliterEqualChars
 * </pre>
 */
public class TXTSpliterEqualChars {
    private static final String dirPath = ".";// current directory
    private static int NUMBER_OF_FILES = 10;// number of parts to produce
    private static String absoluteDirPath = "";
    public static Charset CHARSET_UTF8 = Charset.forName("UTF-8");// charset used for decoding and encoding
    public static Charset CHARSET_GBK = Charset.forName("GBK");// GBK charset; not used by split()
    // Name (without directory) of the file to split; empty until it has been located.
    private static String originalFileName = "";
    private static DecimalFormat format;
    private static java.util.LinkedHashSet<String> suffixSetOfTXTFile = new java.util.LinkedHashSet<String>();

    static {
        suffixSetOfTXTFile.add(".log");
        suffixSetOfTXTFile.add(".LOG");
        suffixSetOfTXTFile.add(".txt");
        suffixSetOfTXTFile.add(".TXT");
        suffixSetOfTXTFile.add(".text");
        suffixSetOfTXTFile.add(".TEXT");
        format = buildFormat(NUMBER_OF_FILES);
        // The file lookup is intentionally NOT done here: an exception escaping a static
        // initializer is reported as ExceptionInInitializerError and leaves the class
        // permanently unusable. main() and split() call locateSourceFile() instead.
    }

    /**
     * Builds the formatter used for the part index in output file names.
     *
     * @param parts number of parts that will be produced
     * @return a formatter that zero-pads to as many digits as {@code parts} has
     *         (e.g. 10 gives "01".."10", 150 gives "001".."150")
     */
    static DecimalFormat buildFormat(int parts) {
        int digits = String.valueOf(Math.max(parts, 1)).length();
        StringBuilder pattern = new StringBuilder(digits);
        for (int i = 0; i < digits; i++) {
            pattern.append('0');
        }
        return new DecimalFormat(pattern.toString());
    }

    /**
     * Computes the cut positions that divide {@code text} into {@code parts} slices.
     *
     * <p>Slice {@code i} spans {@code [result[i], result[i + 1])}. Every slice nominally
     * has {@code text.length() / parts} chars and the last one also takes the remainder.
     * A cut that would separate a high surrogate from the low surrogate following it is
     * shifted forward by one char.
     *
     * @param text  the decoded text
     * @param parts number of slices, must be at least 1
     * @return {@code parts + 1} non-decreasing offsets; the first is 0 and the last is
     *         {@code text.length()}
     * @throws IllegalArgumentException if {@code parts} is less than 1
     */
    static int[] computeCharBoundaries(CharSequence text, int parts) {
        if (parts < 1) {
            throw new IllegalArgumentException("parts must be >= 1 but was " + parts);
        }
        int total = text.length();
        int each = total / parts;
        int[] bounds = new int[parts + 1];
        for (int i = 1; i < parts; i++) {
            int cut = i * each;
            boolean splitsPair = cut > 0 && cut < total
                    && Character.isHighSurrogate(text.charAt(cut - 1))
                    && Character.isLowSurrogate(text.charAt(cut));
            // A lone surrogate cannot be encoded, so keep both halves in the earlier slice.
            bounds[i] = splitsPair ? cut + 1 : cut;
        }
        bounds[parts] = total;
        return bounds;
    }

    /**
     * Resolves (once) the absolute path of the working directory, always terminated by
     * the platform file separator.
     */
    private static String getabsoluteDirPath() {
        if ("".equals(absoluteDirPath)) {
            String path = new File(dirPath).getAbsolutePath();
            // For dirPath "." the absolute path ends in "<separator>."; drop only that dot.
            if (path.endsWith(File.separator + ".")) {
                path = path.substring(0, path.length() - 1);
            }
            if (!path.endsWith(File.separator)) {
                path += File.separator;
            }
            absoluteDirPath = path;
        }
        return absoluteDirPath;
    }

    /**
     * Picks the first regular file in the working directory whose suffix is in
     * {@code suffixSetOfTXTFile} and stores its name in {@code originalFileName}.
     *
     * @return the absolute directory path
     * @throws RuntimeException if the directory holds no matching file
     */
    private static String findTXTFile() {
        File[] entries = new File(getabsoluteDirPath()).listFiles();
        boolean findTXT = false;
        if (entries != null) {// listFiles() yields null when the directory cannot be read
            for (File file : entries) {
                if (!file.isFile()) {
                    continue;
                }
                String fileName = file.getName();
                int index = fileName.lastIndexOf(".");
                if (index < 1) {// no suffix, or a dot-file such as ".log"
                    continue;
                }
                if (suffixSetOfTXTFile.contains(fileName.substring(index))) {
                    originalFileName = fileName;
                    findTXT = true;
                    break;
                }
            }
        }
        if (!findTXT) {
            String tipMsg = "ERROR:请将待分割的文本文件" + suffixSetOfTXTFile.toString() + "放到当前目录下!";
            System.out.println(tipMsg);
            throw new RuntimeException(tipMsg);
        }
        return absoluteDirPath;
    }

    /** Locates the file to split unless a previous call already did so. */
    private static void locateSourceFile() {
        if ("".equals(originalFileName)) {
            findTXTFile();
        }
    }

    /**
     * Closes {@code closeable} if it is non-null; a failure is printed, never propagated.
     *
     * @param closeable resource to close, may be {@code null}
     */
    public static void closeCloseable(Closeable closeable) {
        try {
            if (closeable != null) {
                closeable.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Splits the located text file into {@code NUMBER_OF_FILES} parts by character count.
     *
     * @return {@code true} if every part was written; {@code false} if the part count is
     *         below 2, the file is not valid UTF-8, the file is too large to decode in
     *         memory, or another I/O error occurred (the error is printed)
     * @throws RuntimeException if no text file exists in the working directory
     */
    public static boolean split() {
        if (NUMBER_OF_FILES < 2) {
            System.out.println("分割后的文件个数不能小于2!");
            return false;
        }
        locateSourceFile();
        File originalFile = new File(absoluteDirPath + originalFileName);
        long sizeTotal = originalFile.length();
        // Recomputed on every run because the part count may have changed since class load.
        format = buildFormat(NUMBER_OF_FILES);
        int index = originalFileName.lastIndexOf(".");
        String fileName = originalFileName.substring(0, index);
        String suffix = originalFileName.substring(index);
        try (FileInputStream in = new FileInputStream(originalFile);
                FileChannel inChannel = in.getChannel()) {
            if (sizeTotal > Integer.MAX_VALUE) {
                // A single mapping cannot exceed Integer.MAX_VALUE bytes.
                throw new IOException("File too large to decode in memory: " + sizeTotal + " bytes");
            }
            MappedByteBuffer byteBuffer = inChannel.map(FileChannel.MapMode.READ_ONLY, 0, sizeTotal);
            // Decode the whole file; malformed UTF-8 raises CharacterCodingException.
            CharsetDecoder decoder = CHARSET_UTF8.newDecoder();
            CharBuffer charBuffer = decoder.decode(byteBuffer);
            CharsetEncoder encoder = CHARSET_UTF8.newEncoder();
            int charNumTotal = charBuffer.remaining();
            int charNumEach = charNumTotal / NUMBER_OF_FILES;
            int charRemainder = charNumTotal % NUMBER_OF_FILES;
            System.out.println("byteNumTotal=" + sizeTotal);
            System.out.println("charNumTotal=" + charNumTotal + ", charNumEach=" + charNumEach + ", charRemainder=" + charRemainder);
            int[] bounds = computeCharBoundaries(charBuffer, NUMBER_OF_FILES);
            for (int i = 0; i < NUMBER_OF_FILES; i++) {
                String newFileName = absoluteDirPath + fileName + "_" + format.format(i + 1) + suffix;
                int from = bounds[i];
                int to = bounds[i + 1];
                System.out.println("from " + from + " to " + to + ", charNum=" + (to - from) + ", charBuffer.remaining()=" + charBuffer.remaining() );
                // subSequence shares the decoded content, so no chars are copied here.
                ByteBuffer bBuffer = encoder.encode(charBuffer.subSequence(from, to));
                // One try-with-resources per part so that every output file is closed.
                try (FileOutputStream out = new FileOutputStream(newFileName);
                        FileChannel outChannel = out.getChannel()) {
                    while (bBuffer.hasRemaining()) {
                        outChannel.write(bBuffer);
                    }
                }
            }
            return true;
        } catch (IOException e) {// also covers FileNotFoundException and CharacterCodingException
            e.printStackTrace();
            return false;
        }
    }

    /**
     * Interactive entry point: reads lines from standard input until "exit" is entered
     * or a split with the entered part count succeeds.
     *
     * @param args ignored
     * @throws RuntimeException if no text file exists in the working directory
     */
    public static void main(String... args)throws Exception {
        locateSourceFile();// fail before prompting when there is nothing to split
        System.out.println("①输入exit并敲回车,结束程序。");
        System.out.println("②输入大于1的整数(N)并敲回车,将文本分割成N分。");
        try (BufferedReader bufReader = new BufferedReader(new InputStreamReader(System.in))) {
            String line;
            while ((line = bufReader.readLine()) != null) {
                System.out.println("本次输入的内容是:" + line);
                String input = line.trim();// tolerate stray blanks around the number
                if (input.equalsIgnoreCase("exit")) {
                    break;
                }
                int count;
                try {
                    count = Integer.parseInt(input);
                } catch (NumberFormatException e) {
                    System.out.println("请输入大于1的整数:");
                    continue;
                }
                if (count < 2) {
                    System.out.println("请输入大于1的整数:");
                    continue;
                }
                NUMBER_OF_FILES = count;
                System.out.println("文本将分割成" + NUMBER_OF_FILES + "份");
                long start = System.currentTimeMillis();
                boolean success = split();
                long end = System.currentTimeMillis();
                if (success) {
                    System.out.println("文本分割已完成,耗时(ms)=" + (end -start));
                    break;
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
--------------------------------分割线--------------------------------
运行环境:JDK 1.7、1.8
Windows 批处理文件(*.bat):
TXTSpliterEqualChars.bat,内容如下:
javac -d . -encoding UTF-8 TXTSpliterEqualChars.java
java TXTSpliterEqualChars
:pause
TXTSpliterEqualBytes.bat,内容如下:
javac -d . -encoding UTF-8 TXTSpliterEqualBytes.java
java TXTSpliterEqualBytes
:pause
将待分割的文本文件放到与批处理文件相同的目录下,然后双击对应的批处理文件运行。
--------------------------------分割线--------------------------------
发表评论 取消回复