A. python如何读取word文件
def PrintAllParagraphs(doc):
    """Print the text of every paragraph of a Word document, last paragraph first.

    Args:
        doc: A Word document object exposing ``Paragraphs.Count`` and
            zero-based ``Paragraphs[i].Range.Text``.
    """
    count = doc.Paragraphs.Count
    # Indices run from count - 1 down to 0, i.e. reverse document order.
    for i in range(count - 1, -1, -1):
        pr = doc.Paragraphs[i].Range
        # The parenthesised form is valid on both Python 2 and Python 3.
        print(pr.Text)
# Example usage: fetch the Word application, take its first open document
# and print that document's paragraphs.
app = my.Office.Word.GetInstance()
doc = app.Documents[0]
PrintAllParagraphs(doc)
# Output printed by the call above in the original session:
#   1.什么是域
#   域应用基础
@staticmethod
def GetInstance():
    """Return the Application object of the Word application.

    The object is obtained through ``win32com.client.Dispatch`` using the
    ``'Word.Application'`` ProgID.
    """
    # Imported lazily so that win32com is only required when Word is used.
    import win32com.client
    return win32com.client.Dispatch('Word.Application')
my.Office.Word.GetInstance 方法的实现如上,它是对使用 win32com 操纵 Word COM 接口的一个封装
所有Paragraph即段落对象,都是通过Paragraph.Range.Text来访问它的文字的
B. python读取word文档内容
import fnmatch, os, sys, win32com.client
# Convert every .docx file below ``readpath`` to a plain-text file saved next
# to it, driving Word through COM.
readpath = r'D:\123'
wordapp = win32com.client.gencache.EnsureDispatch("Word.Application")
try:
    for path, dirs, files in os.walk(readpath):
        for filename in files:
            if not fnmatch.fnmatch(filename, '*.docx'):
                continue
            # "~$name.docx" entries are the lock files Word keeps beside open
            # documents; they are not documents and cannot be converted.
            if filename.startswith('~$'):
                continue
            doc = os.path.abspath(os.path.join(path, filename))
            print('processing %s...' % doc)
            # Work on the object returned by Open() rather than on
            # ActiveDocument, which depends on which window has focus.
            document = wordapp.Documents.Open(doc)
            try:
                docastext = os.path.splitext(doc)[0] + '.txt'
                document.SaveAs(
                    docastext,
                    FileFormat=win32com.client.constants.wdFormatText)
            finally:
                # Close the document even when SaveAs fails.
                document.Close()
finally:
    # Always shut Word down so no background WINWORD process is left behind.
    wordapp.Quit()
print('end')
import io

# Print the converted text file line by line. The file is decoded as GBK, the
# encoding the original snippet applied to every line; io.open() accepts
# ``encoding`` on both Python 2 and Python 3, and ``with`` closes the file
# even if printing fails.
with io.open(r'd:\123\test.txt', 'r', encoding='gbk') as f:
    for line in f:
        # print() adds its own newline, so drop the one read from the file
        # to avoid an empty line after every line of text.
        print(line.rstrip('\r\n'))
C. 如何在 Linux 上使用 Python 读取 word 文件信息
第一步:获取docx文件的xml组成文件
import zipfile


def get_word_xml(docx_filename):
    """Return the raw content of ``word/document.xml`` from a .docx file.

    Args:
        docx_filename: Path of the .docx file (a zip archive).

    Returns:
        The bytes of the ``word/document.xml`` archive member.
    """
    # ZipFile opens the archive in binary mode itself and, used as a context
    # manager, closes it again; the original opened the file in text mode.
    with zipfile.ZipFile(docx_filename) as archive:
        return archive.read('word/document.xml')
第二步:解析xml为树形数据结构
from lxml import etree


def get_xml_tree(xml_string):
    """Parse an XML string into an element tree.

    Args:
        xml_string: The XML document to parse.

    Returns:
        The value returned by ``etree.fromstring`` for ``xml_string``.
    """
    return etree.fromstring(xml_string)
第三步:读取word内容:
def _itertext(self, my_etree):
    """Iterate over the text nodes of an XML tree.

    Args:
        my_etree: Tree (or element) whose ``iter`` method is used to walk
            all elements.

    Yields:
        ``(node, node.text)`` for every node that ``_check_element_is``
        recognises as a ``t`` element.
    """
    for node in my_etree.iter(tag=etree.Element):
        if self._check_element_is(node, 't'):
            yield (node, node.text)


def _check_element_is(self, element, type_char):
    """Return True if ``element`` is the WordprocessingML element ``type_char``.

    Args:
        element: Object whose ``tag`` attribute is a namespace-qualified name
            of the form ``{namespace}local-name``.
        type_char: Local element name to test for, e.g. ``'t'``.
    """
    # Main WordprocessingML namespace (the "w:" prefix in document.xml).
    # The previous placeholder value '99999' could never match a real tag.
    word_schema = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    return element.tag == '{%s}%s' % (word_schema, type_char)
D. word文字替换批处理之python
媳妇有无数word文档需要替换内容,上网搜索后发现没有现成的方法。
google后也没有找到太合适的。抄抄写写弄了个python脚本,用来替换目录下所有word文档中的内容,共勉之。
import os
from docx import Document
# Directories that contain some .docx files. The mapping goes from the
# directory that is read to the directory that is written; here every
# directory maps to itself.
files_dict = {
    directory: directory
    for directory in (
        "/home/test/a/医疗器械临床试验第一版-设计/",
        "/home/test/a/医疗器械临床试验第一版-管理制度/",
        "/home/test/a/医疗器械临床试验第一版-SOP/",
        "/home/test/a/目录/",
    )
}

# Text to search for -> text to put in its place.
replace_dict = dict([
    ("XXGNK", "XZDXGWK"),
    ("心血管专业", "心脏大血管外科"),
    ("心血管", "心脏大血管外科"),
])
def _replace_in_paragraphs(paragraphs, replace_dict):
    """Apply every key -> value replacement to each run of ``paragraphs``."""
    for para in paragraphs:
        for run in para.runs:
            for key, value in replace_dict.items():
                if key in run.text:
                    print(key + "-->" + value)
                    run.text = run.text.replace(key, value)


def check_and_change(document, replace_dict):
    """Replace text in the paragraphs and table cells of a Word document.

    Every run of every body paragraph, and of every paragraph inside every
    table cell, is searched for the keys of ``replace_dict``; each key found
    is replaced by its value and the replacement is printed. Runs are checked
    one at a time, so a key that is split across two runs is not replaced.

    Args:
        document: Document object exposing ``paragraphs`` and ``tables``.
        replace_dict: Mapping of text to find -> replacement text.

    Returns:
        The same ``document`` object, modified in place.
    """
    _replace_in_paragraphs(document.paragraphs, replace_dict)
    for table in document.tables:
        for row in table.rows:
            for cell in row.cells:
                _replace_in_paragraphs(cell.paragraphs, replace_dict)
    return document
def main():
    """Run the replacements on every .docx file of the configured directories.

    For each ``source directory -> target directory`` pair in ``files_dict``,
    every directory entry is printed; entries with a ``.docx`` extension are
    opened, passed through ``check_and_change`` with ``replace_dict`` and
    saved under the same name in the target directory.
    """
    for old_file_path, new_file_path in files_dict.items():
        for name in os.listdir(old_file_path):
            print(name)
            # Test the real extension: splitting on "." and taking item 1
            # raised IndexError for names without a dot and picked the wrong
            # part for names containing more than one dot.
            if os.path.splitext(name)[1] == '.docx':
                old_file = os.path.join(old_file_path, name)
                new_file = os.path.join(new_file_path, name)
                document = Document(old_file)
                document = check_and_change(document, replace_dict)
                document.save(new_file)
            print("^" * 30)


if __name__ == '__main__':
    main()
E. 如何用python读取word
使用Python的内置函数open()读取文本文件
# Read and print a whole text file. ``with`` closes the file even when
# reading fails, and nothing is left to clean up if open() itself raises
# (a try/finally that refers to ``f`` would hit an unbound name in that case).
with open('/file', 'r') as f:
    print(f.read())
如果读取word文档推荐使用第三方插件,python-docx 可以在官网上下载
使用方式
#-*-coding:cp936-*-
importdocx
document=docx.Document(文件路径)
docText=' '.join([
paragraph.text.encode('utf-8')forparagraphindocument.paragraphs
])
printdocText
F. python如何读取word文件中的文本内容并写入到新的txt文件