python遍历文件内容_python遍历文件进行数据处理

背景

之前写过一个遍历文件夹并进行处理的Python程序,但因为时间太久找不着了,只能自己再写一遍。于是决定将代码放在博客中,以便以后使用。

#!/usr/bin/env python

#-*- coding:utf-8 -*-

import math

import os

import glob

import numpy as np

import jieba

import string

import jieba.analyse

def read_from_file(directions):
    """Read a text file, trying several encodings until one decodes.

    Args:
        directions: Path of the file to read.

    Returns:
        The whole file content as a ``str``, decoded with the first
        candidate encoding that reads the file without a decode error.

    Raises:
        OSError: If the file cannot be opened (for example, it does not
            exist). This is no longer masked as a decoding failure.
        Exception: If none of the candidate encodings can decode the file.
    """
    # Candidate encodings, tried in this order.
    decode_set = ['utf-8', 'gb18030', 'ISO-8859-2', 'gb2312', 'gbk']

    for k in decode_set:
        try:
            # The context manager closes the handle even when read() fails.
            with open(directions, "r", encoding=k) as file:
                return file.read()
        except UnicodeDecodeError:
            # Wrong guess for this file; fall through to the next encoding.
            continue

    raise Exception("%s had no way to decode" % directions)

# Batch keyword extraction over every .txt file in TXT_DIR.
#
# For each input file, the text between the 'description' marker and the next
# '#' (capped at the 'comments' marker) is passed to jieba's TF-IDF keyword
# extractor, and the resulting keywords are written one per line to
# "<input base name>keyword.txt" in the same directory.
#
# NOTE(review): the output files also match "*.txt", so a later run will pick
# up earlier outputs as inputs -- confirm whether that is intended.
TXT_DIR = r"C:/Users/Administrator/Documents/Tencent Files/9/FileRecv/TXT/"

filenames = glob.glob(TXT_DIR + "*.txt")
filenameslen = len(filenames)
count = 0

for filename in filenames:
    # Progress indicator: files processed so far : total files.
    print("%d : %d" % (count, filenameslen))

    # File name without directory and extension; used to name the output.
    base_name = os.path.splitext(os.path.basename(filename))[0]

    # Lines are read as bytes and formatted with '%s', so `content` holds the
    # bytes reprs ("b'...'") joined by spaces rather than decoded text.
    # NOTE(review): the +16 offset below appears tuned to that representation;
    # switching to read_from_file() would require re-deriving the offsets.
    with open(filename, "rb") as in_file:
        raw_lines = in_file.readlines()
    content = " ".join('%s' % line for line in raw_lines)

    # Start 16 characters after the beginning of 'description' (the 11-char
    # marker plus 5 more). If a marker is missing, find() returns -1 and the
    # slice bounds are computed from that value, exactly as before.
    start = content.find('description') + 16
    overflow = content.find('comments')
    end = content[start:].find('#') + start
    # Never read past the 'comments' marker.
    end = min(end, overflow)
    file_data = content[start:end]

    # Keyword extraction based on the TF-IDF algorithm.
    keywords = jieba.analyse.extract_tags(file_data)

    # The output file is created even when there are no keywords.
    with open(TXT_DIR + base_name + 'keyword' + '.txt', 'w') as out_file:
        if not keywords:
            # Previously this check sat inside the write loop and could
            # never run; report empty extraction results explicitly.
            print("error,please check your input")
        out_file.writelines(keyword + '\n' for keyword in keywords)

    count = count + 1

print("%d : %d" % (count, filenameslen))
print("finished")