// 本文的 Java 程序用于统计一篇英语文章中各个单词出现的次数,基于字节实现,整个统计过程不需要将字节转换为实际字符:
package jaas;
import java.io.FileInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * Counts how often each word occurs in a stream of English text.
 *
 * <p>The analysis works directly on raw bytes: the input is split on single-byte
 * separator characters and each word is stored as its byte sequence. Bytes are only
 * decoded to text when the result is printed.
 */
public class WordAnalyzer {

    /** Input file used by {@link #main(String[])} when no path is given on the command line. */
    private static final String DEFAULT_INPUT_PATH = "/home/conquer/Downloads/zzzzzzzzzzzzzzzzz/a.txt";

    /** Occurrence counter per distinct word, keyed by the word's raw bytes. */
    private final ConcurrentMap<ByteArrayWrapper, AtomicInteger> resultMap = new ConcurrentHashMap<>();

    /**
     * Gives a byte sequence value semantics ({@code equals}/{@code hashCode} based on
     * content) so that it can be used as a map key.
     */
    public class ByteArrayWrapper {

        private final byte[] bytes;

        /**
         * Wraps a copy of the given bytes.
         *
         * @param bytes the word's bytes; copied so that later changes to the caller's
         *              array cannot alter a key that is already stored in a map
         * @throws NullPointerException if {@code bytes} is {@code null}
         */
        public ByteArrayWrapper(byte[] bytes) {
            this.bytes = bytes.clone();
        }

        /**
         * Wraps the unboxed content of the given array.
         *
         * @param bytesVal the word's bytes in boxed form
         * @throws NullPointerException if the array or one of its elements is {@code null}
         */
        public ByteArrayWrapper(Byte[] bytesVal) {
            bytes = new byte[bytesVal.length];
            for (int i = 0; i < bytes.length; i++) {
                bytes[i] = bytesVal[i];
            }
        }

        /** Content-based hash; equal byte sequences produce equal hash codes. */
        @Override
        public int hashCode() {
            return Arrays.hashCode(bytes);
        }

        /** Two wrappers are equal when they hold the same bytes in the same order. */
        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (!(obj instanceof ByteArrayWrapper)) {
                return false;
            }
            return Arrays.equals(bytes, ((ByteArrayWrapper) obj).bytes);
        }

        /** Returns the numeric byte values, e.g. {@code [104, 105]}. */
        @Override
        public String toString() {
            return Arrays.toString(bytes);
        }
    }

    /** Decides whether a single byte terminates a word. */
    public class WordEndChecker {

        // Only English (single-byte) text is supported. Full-width punctuation is not
        // handled: such characters occupy two or three bytes, while this program splits
        // the input one byte at a time.
        private final char[] wordEndChars = {
            '\r', '\n', ' ', '\t', '.', ',', '!', '?', '\'', ';', ':', '"', '(', ')'
        };

        /** Lookup table indexed by the unsigned byte value; {@code true} marks a separator. */
        private final boolean[] wordEnd = new boolean[256];

        {
            for (char c : wordEndChars) {
                wordEnd[c & 0xFF] = true;
            }
        }

        /**
         * @param b the byte to test
         * @return {@code true} if {@code b} is one of the separator characters
         */
        boolean isWordEnd(byte b) {
            // Mask to 0..255 because Java bytes are signed.
            return wordEnd[b & 0xFF];
        }
    }

    /**
     * Analyzes a text file and prints the word statistics to standard output.
     *
     * @param args optional; {@code args[0]} is the path of the file to analyze. Without
     *             arguments the built-in default path is used.
     * @throws Exception if the file cannot be opened or read
     */
    public static void main(String[] args) throws Exception {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;
        WordAnalyzer analyzer = new WordAnalyzer();
        try (InputStream ins = new FileInputStream(path)) {
            analyzer.analyzeInputStream(ins);
        }
        analyzer.doResult();
    }

    /**
     * Prints the number of distinct words followed by one {@code word: count} line per
     * word, ordered by descending count.
     */
    public void doResult() {
        System.out.println("单词总数:" + resultMap.size());

        List<Map.Entry<ByteArrayWrapper, AtomicInteger>> list = new ArrayList<>(resultMap.entrySet());
        Collections.sort(list, new Comparator<Map.Entry<ByteArrayWrapper, AtomicInteger>>() {
            @Override
            public int compare(Map.Entry<ByteArrayWrapper, AtomicInteger> o1,
                               Map.Entry<ByteArrayWrapper, AtomicInteger> o2) {
                // Arguments are swapped on purpose: highest count first.
                return Integer.compare(o2.getValue().get(), o1.getValue().get());
            }
        });

        for (Map.Entry<ByteArrayWrapper, AtomicInteger> e : list) {
            // Decode with an explicit charset so the output does not depend on the platform default.
            System.out.println(new String(e.getKey().bytes, StandardCharsets.UTF_8) + ": " + e.getValue());
        }
    }

    /**
     * Reads the stream to its end and adds every word found to the running counts.
     * The stream is not closed by this method.
     *
     * @param fis the stream to read
     * @throws Exception if reading from the stream fails
     */
    public void analyzeInputStream(InputStream fis) throws Exception {
        final WordEndChecker wordEndChecker = new WordEndChecker();
        byte[] temp = new byte[1024]; // read buffer
        // Bytes of the word currently being assembled; kept across reads so that a word
        // split over two buffer fills is still counted as one word.
        List<Byte> listTemp = new ArrayList<>(20);
        for (int read; (read = fis.read(temp)) != -1; ) {
            for (int i = 0; i < read; i++) {
                byte cb = temp[i];
                if (wordEndChecker.isWordEnd(cb)) {
                    flushlistTempToResultMap(listTemp);
                } else {
                    listTemp.add(cb);
                }
            }
        }
        // The text may not end with a separator, so the last word is flushed explicitly.
        flushlistTempToResultMap(listTemp);
    }

    /**
     * Counts the word whose bytes are held in {@code listTemp} and then empties the list.
     * An empty list is ignored.
     *
     * @param listTemp bytes of one complete word; cleared on return when it was non-empty
     */
    public void flushlistTempToResultMap(List<Byte> listTemp) {
        if (listTemp.isEmpty()) {
            return;
        }
        ByteArrayWrapper word = new ByteArrayWrapper(listTemp.toArray(new Byte[0]));
        AtomicInteger count = resultMap.get(word);
        if (count == null) {
            // putIfAbsent makes "create the counter if missing" a single atomic step, so
            // two threads seeing the same new word cannot overwrite each other's counter.
            AtomicInteger fresh = new AtomicInteger(0);
            count = resultMap.putIfAbsent(word, fresh);
            if (count == null) {
                count = fresh;
            }
        }
        count.incrementAndGet();
        listTemp.clear();
    }
}
// Copyright notice: this is an original article by conquer0715, licensed under CC 4.0 BY-SA.
// When reposting, please include a link to the original source together with this notice.