word和.txt文件转html及pdf文件:使用poi、jsoup、itext的心得
本人第一次写博客,有什么不足或者需要改正的地方,希望大家指出来,一起学习交流讨论。
由于在项目中遇到了这一个问题,在网上也找了很多方法,感觉千篇一律,总有一些问题,因此总结出word转html和pdf文件使用方法。
虽然poi功能不是很强大,但毕竟不依靠本地office软件,同样还有一种方式使用jacob也可以将word转html,不过这个方式要依靠本地office,
而且只能在windows平台下,不支持unix系统。jacob使用起来还是比较简单的,如果大家需要jacob的使用方法,我会分享给大家。
关于.txt文件转html,就是使用io操作将.txt文件读取出来然后写入到html中,也不需要额外的jar包。
注意:使用poi需要注意以下几项。由于我在做这个功能的时候没有注意到这些问题的存在,一直找不出原因,还请各位大牛们指点一下原因。
1. 使用office的文档.doc和.docx格式的都没有问题,但使用wps生成的word文档时,只能转.doc格式的文件,对.docx的文档转出后没有图片,得不到img属性。
2. 在使用word文档转pdf格式的文件时,生成的pdf没有中文,对中文显示不是很支持。
3. 在将word转成pdf时,需要把生成的html文件转化成标准的html文件,不然解析后会出现<meta>或者<img>标签不闭合的情况。
4. 使用的jar包如下,都可以在maven中央仓库下载得到。下面就直接附上代码了,希望大家有什么问题在下面评论互相交流和学习,使用时直接调用方法即可。
如果大家觉得可以请点一个赞,谢谢大家。
package
com.kqco.tools;
import
org.apache.poi.hwpf.HWPFDocument;
import
org.apache.poi.hwpf.converter.PicturesManager;
import
org.apache.poi.hwpf.converter.WordToHtmlConverter;
import
org.apache.poi.hwpf.usermodel.PictureType;
import
org.apache.poi.xwpf.converter.core.BasicURIResolver;
import
org.apache.poi.xwpf.converter.core.FileImageExtractor;
import
org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import
org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import
org.apache.poi.xwpf.usermodel.XWPFDocument;
import
org.jsoup.Jsoup;
import
org.w3c.dom.Document;
import
org.w3c.tidy.Tidy;
import
org.xhtmlrenderer.pdf.ITextFontResolver;
import
org.xhtmlrenderer.pdf.ITextRenderer;
import
com.lowagie.text.pdf.BaseFont;
import
javax.xml.parsers.DocumentBuilderFactory;
import
javax.xml.transform.OutputKeys;
import
javax.xml.transform.Transformer;
import
javax.xml.transform.TransformerFactory;
import
javax.xml.transform.dom.DOMSource;
import
javax.xml.transform.stream.StreamResult;
import
java.io.BufferedInputStream;
import
java.io.BufferedOutputStream;
import
java.io.BufferedReader;
import
java.io.BufferedWriter;
import
java.io.ByteArrayInputStream;
import
java.io.ByteArrayOutputStream;
import
java.io.DataOutputStream;
import
java.io.File;
import
java.io.FileInputStream;
import
java.io.FileOutputStream;
import
java.io.IOException;
import
java.io.InputStreamReader;
import
java.io.OutputStream;
import
java.io.OutputStreamWriter;
import
java.io.PrintWriter;
import
java.nio.file.Path;
import
java.nio.file.Paths;
public
class
FileConverter {
/**
 * Converts a Word document to an HTML file, choosing the converter from the
 * source file's extension.
 *
 * <p>{@code .docx} sources are passed to {@code docxToHtml} and {@code .doc}
 * sources to {@code docToHtml}. The extension is compared case-insensitively,
 * so {@code REPORT.DOCX} is accepted as well.
 *
 * @param sourceFilePath     path of the Word document to convert
 * @param targetFilePosition path of the HTML file to write
 * @throws RuntimeException if the path has no extension, or the extension is
 *                          neither {@code .doc} nor {@code .docx}
 * @throws Exception        if the selected converter fails
 */
public void wordToHtml(String sourceFilePath, String targetFilePosition) throws Exception {
    // A path without any '.' previously reached substring(-1) and failed with a
    // StringIndexOutOfBoundsException; treat it as "no extension" instead so the
    // caller gets the same format error as for any other unsupported file.
    int dotIndex = sourceFilePath.lastIndexOf('.');
    String extension = dotIndex < 0 ? "" : sourceFilePath.substring(dotIndex);

    if (".docx".equalsIgnoreCase(extension)) {
        docxToHtml(sourceFilePath, targetFilePosition);
    } else if (".doc".equalsIgnoreCase(extension)) {
        docToHtml(sourceFilePath, targetFilePosition);
    } else {
        throw new RuntimeException("文件格式不正确");
    }
}
/**
 * Converts a binary {@code .doc} document to an HTML file using POI's
 * {@link WordToHtmlConverter}.
 *
 * <p>Embedded pictures are written to an {@code image} directory next to the
 * target file; the directory is created on first use. The {@code src} written
 * into the HTML for each picture is {@code "../tmp/image/" + name}.
 *
 * @param sourceFilePath     path of the {@code .doc} file to read
 * @param targetFilePosition path of the HTML file to write (UTF-8)
 * @throws Exception if the document cannot be read, converted or written
 */
private void docToHtml(String sourceFilePath, String targetFilePosition) throws Exception {
    // toAbsolutePath() guarantees a parent even when the target is a bare file
    // name such as "out.html"; getParent() on the relative path would be null.
    final Path imagePath = Paths.get(targetFilePosition).toAbsolutePath().getParent().resolve("image");

    // Close the source stream as soon as the document is built.
    // NOTE(review): relies on HWPFDocument reading the whole stream in its
    // constructor - confirm against the POI version in use.
    HWPFDocument wordDocument;
    try (FileInputStream sourceStream = new FileInputStream(sourceFilePath)) {
        wordDocument = new HWPFDocument(sourceStream);
    }

    Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
    WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(document);
    wordToHtmlConverter.setPicturesManager(new PicturesManager() {
        @Override
        public String savePicture(byte[] content, PictureType pictureType, String name,
                                  float width, float height) {
            // Best effort, as before: a picture that cannot be saved is logged
            // and the conversion continues.
            try {
                File imageDir = imagePath.toFile();
                // The directory was never created originally, so every picture
                // write failed unless the caller had prepared it beforehand.
                if (!imageDir.mkdirs() && !imageDir.isDirectory()) {
                    throw new IOException("Cannot create image directory: " + imageDir);
                }
                try (FileOutputStream out = new FileOutputStream(imagePath.resolve(name).toFile())) {
                    out.write(content);
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
            // NOTE(review): this URL does not obviously match the directory the
            // picture is saved to above - confirm against the deployment layout.
            return "../tmp/image/" + name;
        }
    });
    wordToHtmlConverter.processDocument(wordDocument);

    Document htmlDocument = wordToHtmlConverter.getDocument();
    DOMSource domSource = new DOMSource(htmlDocument);

    Transformer serializer = TransformerFactory.newInstance().newTransformer();
    serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
    serializer.setOutputProperty(OutputKeys.INDENT, "yes");
    serializer.setOutputProperty(OutputKeys.METHOD, "html");

    // Write through a stream we own so it is closed even if the transform fails.
    try (OutputStream out = new BufferedOutputStream(new FileOutputStream(targetFilePosition))) {
        serializer.transform(domSource, new StreamResult(out));
    }
}
/**
 * Converts an OOXML {@code .docx} document to an XHTML file using
 * {@link XHTMLConverter}.
 *
 * <p>Extracted images are written below {@code ../tmp/image/word/media},
 * resolved against the target file's directory, and image URLs in the
 * generated markup are prefixed with {@code ../tmp/image/word/media}.
 *
 * @param sourceFilePath path of the {@code .docx} file to read
 * @param targetFileName path of the HTML file to write (UTF-8)
 * @throws Exception if the document cannot be read, converted or written
 */
private void docxToHtml(String sourceFilePath, String targetFileName) throws Exception {
    // toAbsolutePath() guarantees a parent even when the target is a bare file
    // name; getParent() on such a relative path would be null.
    String imagePathStr = Paths.get(targetFileName).toAbsolutePath().getParent()
            .resolve("../tmp/image/word/media").toString();

    // Close the source stream as soon as the document is built.
    // NOTE(review): relies on XWPFDocument loading the whole package from the
    // stream in its constructor - confirm against the POI version in use.
    XWPFDocument document;
    try (FileInputStream sourceStream = new FileInputStream(sourceFilePath)) {
        document = new XWPFDocument(sourceStream);
    }

    XHTMLOptions options = XHTMLOptions.create();
    options.setExtractor(new FileImageExtractor(new File(imagePathStr)));
    options.URIResolver(new BasicURIResolver("../tmp/image/word/media"));

    // The target is only opened after the source parsed successfully, so a bad
    // input does not leave an empty output file behind.
    try (FileOutputStream targetStream = new FileOutputStream(targetFileName);
         OutputStreamWriter outputStreamWriter = new OutputStreamWriter(targetStream, "UTF-8")) {
        XHTMLConverter xhtmlConverter = (XHTMLConverter) XHTMLConverter.getInstance();
        xhtmlConverter.convert(document, outputStreamWriter, options);
    }
}
/**
 * Converts a GBK-encoded plain-text file to a UTF-8 HTML fragment in which
 * every input line is followed by a line break element.
 *
 * <p>The characters {@code &}, {@code <} and {@code >} are escaped so that text
 * content cannot be mistaken for markup. Failures are reported on standard
 * output and never propagated to the caller.
 *
 * @param filePath     path of the text file to read
 * @param htmlPosition path of the HTML file to write
 */
public void txtToHtml(String filePath, String htmlPosition) {
    try {
        String encoding = "GBK";
        File file = new File(filePath);
        if (!file.isFile()) {
            System.out.println("找不到指定的文件");
            return;
        }
        // try-with-resources closes every stream even when reading or writing
        // fails part-way; the original leaked all of them on an exception.
        try (FileInputStream in = new FileInputStream(file);
             BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(in, encoding));
             FileOutputStream out = new FileOutputStream(new File(htmlPosition));
             BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(out, "UTF-8"))) {
            String lineTxt;
            while ((lineTxt = bufferedReader.readLine()) != null) {
                bw.write(escapeHtmlText(lineTxt));
                // "</br>" is not a valid tag; "<br/>" is valid in HTML and XHTML.
                bw.write("<br/>");
            }
        }
    } catch (Exception e) {
        System.out.println("读取文件内容出错");
        e.printStackTrace();
    }
}

/** Escapes the characters that would otherwise be parsed as HTML markup. */
private static String escapeHtmlText(String text) {
    StringBuilder escaped = new StringBuilder(text.length() + 16);
    for (int i = 0; i < text.length(); i++) {
        char c = text.charAt(i);
        switch (c) {
            case '&':
                escaped.append("&amp;");
                break;
            case '<':
                escaped.append("&lt;");
                break;
            case '>':
                escaped.append("&gt;");
                break;
            default:
                escaped.append(c);
        }
    }
    return escaped.toString();
}
/**
 * Copies the file at {@code sourceFilePath} byte-for-byte to
 * {@code targetFilePosition}, replacing any existing target.
 *
 * <p>Despite its name, this method does not inspect or rewrite the content; it
 * is a plain file copy. If both paths denote the same existing file the method
 * returns without touching it.
 *
 * @param sourceFilePath     path of the file to read
 * @param targetFilePosition path of the file to write
 * @throws IOException if the source cannot be read or the target cannot be written
 */
public void changeImageUrl(String sourceFilePath, String targetFilePosition) throws IOException {
    File source = new File(sourceFilePath);
    File target = new File(targetFilePosition);
    // Opening the target truncates it; when it is the source itself that would
    // destroy the data before a single byte has been read.
    if (source.isFile() && source.getCanonicalFile().equals(target.getCanonicalFile())) {
        return;
    }

    // try-with-resources closes both streams even if the copy fails part-way.
    try (BufferedInputStream bufis = new BufferedInputStream(new FileInputStream(source));
         BufferedOutputStream bufos = new BufferedOutputStream(new FileOutputStream(target))) {
        byte[] chunk = new byte[8192];
        int len;
        while ((len = bufis.read(chunk)) != -1) {
            bufos.write(chunk, 0, len);
        }
    }
}
/**
 * Runs an HTML file through JTidy and writes the result as XHTML.
 *
 * <p>The input is read as {@code gb2312} and the output is written as UTF-8.
 * The tidied document is held in memory and written to {@code outfile} only
 * after parsing has finished, so the input and output path may be the same.
 *
 * @param f_in    path of the HTML file to clean up
 * @param outfile path of the XHTML file to write
 * @return {@code true} if the file was parsed and written, {@code false} if
 *         any exception occurred (the exception is printed, not propagated)
 */
private boolean parseToXhtml(String f_in, String outfile) {
    try {
        Tidy tidy = new Tidy();
        // NOTE(review): the other converters in this class write UTF-8 files;
        // confirm that gb2312 is really the encoding of the files passed here.
        tidy.setInputEncoding("gb2312");
        tidy.setQuiet(true);
        tidy.setOutputEncoding("UTF-8");
        tidy.setShowWarnings(true);
        tidy.setIndentContent(true);
        tidy.setSmartIndent(true);
        tidy.setIndentAttributes(false);
        tidy.setWraplen(1024);
        tidy.setXHTML(true);
        tidy.setErrout(new PrintWriter(System.out));

        ByteArrayOutputStream tidyOutStream = new ByteArrayOutputStream();

        // Hand the raw file bytes to Tidy and let it decode them with the input
        // encoding configured above. The original first round-tripped the bytes
        // through String using the platform default charset, which corrupted
        // non-ASCII text on any JVM whose default is not GB2312-compatible.
        try (BufferedInputStream in = new BufferedInputStream(new FileInputStream(f_in))) {
            tidy.parse(in, tidyOutStream);
        }

        try (FileOutputStream out = new FileOutputStream(outfile)) {
            tidyOutStream.writeTo(out);
        }
        return true;
    } catch (Exception ex) {
        System.out.println(ex.toString());
        ex.printStackTrace();
        return false;
    }
}
/**
 * Renders an XHTML file to PDF with Flying Saucer's {@link ITextRenderer}.
 *
 * <p>The SimSun font is registered from {@code C:/Windows/Fonts/simsun.ttc}
 * before layout.
 *
 * @param inputFile  path of the XHTML file to render
 * @param outputFile path of the PDF file to write
 * @return always {@code true}; failures are reported by exception
 * @throws Exception if the document cannot be loaded, the font cannot be
 *                   registered, or the PDF cannot be written
 */
private boolean convertHtmlToPdf(String inputFile, String outputFile) throws Exception {
    ITextRenderer renderer = new ITextRenderer();
    String url = new File(inputFile).toURI().toURL().toString();
    renderer.setDocument(url);

    // NOTE(review): hard-coded Windows font location; addFont fails on hosts
    // where this file does not exist.
    ITextFontResolver fontResolver = renderer.getFontResolver();
    fontResolver.addFont("C:/Windows/Fonts/simsun.ttc", BaseFont.IDENTITY_H, BaseFont.NOT_EMBEDDED);

    // The original passed the literal string "imagePath" here, which is not a
    // URL and so cannot serve as a base for relative image references. Use the
    // document's own URL so relative paths resolve against the HTML file.
    // NOTE(review): presumably a variable named imagePath was intended -
    // confirm the desired base location.
    renderer.getSharedContext().setBaseURL(url);
    renderer.layout();

    // Open the output only once the document is laid out, and close it even if
    // PDF generation fails; the original leaked the stream on any exception.
    try (OutputStream os = new FileOutputStream(outputFile)) {
        renderer.createPDF(os);
        os.flush();
    }
    return true;
}
/**
 * Rewrites an HTML file in place so that it is better suited for XHTML-based
 * PDF rendering.
 *
 * <p>Every {@code meta} element is turned into a UTF-8 {@code Content-Type}
 * declaration, {@code meta} and {@code img} elements are given a single-space
 * body, each {@code style} element gets {@code mce_bogus="1"}, and the
 * {@code body} element's inline style is extended with
 * {@code font-family: SimSun}.
 *
 * @param targetHtml path of the HTML file to normalize; it is read as UTF-8 and
 *                   overwritten as UTF-8
 * @throws IOException if the file cannot be read or written
 */
private static void standardHTML(String targetHtml) throws IOException {
    File f = new File(targetHtml);
    org.jsoup.nodes.Document doc = Jsoup.parse(f, "UTF-8");

    doc.select("meta")
            .removeAttr("name")
            .attr("content", "text/html; charset=UTF-8")
            .attr("http-equiv", "Content-Type")
            // A text child makes the serializer emit an explicit closing tag.
            .html(" ");
    doc.select("img").html(" ");
    doc.select("style").attr("mce_bogus", "1");

    // The original set a "font-family" HTML attribute on <body>, which is not
    // CSS and has no effect on rendering. Append the declaration to the inline
    // style instead, keeping whatever style the element already carries.
    for (org.jsoup.nodes.Element body : doc.select("body")) {
        String style = body.attr("style").trim();
        if (!style.isEmpty() && !style.endsWith(";")) {
            style += ";";
        }
        body.attr("style", style + "font-family: SimSun;");
    }

    // NOTE(review): this declaration is not terminated with "?>" and is parsed
    // by jsoup as an HTML fragment - confirm it ends up in the output as intended.
    doc.select("html").before("<?xml version='1.0' encoding='UTF-8'>");

    String html = doc.html();
    // try-with-resources closes the file even if the write fails; the original
    // leaked the stream on an exception and also echoed the document to stdout.
    try (FileOutputStream fos = new FileOutputStream(f, false);
         OutputStreamWriter osw = new OutputStreamWriter(fos, "UTF-8")) {
        osw.write(html);
    }
}
}