perl 转换 docx 为 pdf

皮总 发布于 2012/02/28 21:53
阅读 1K+
收藏 1

1. 安装  office 2010

2. 安装 active perl 

#!/usr/bin/perl

# use strict;
use warnings;
use File::Basename;
use File::Copy;
use Win32::OLE;
use Win32::OLE::Const 'Microsoft Word';

my $queue_dir="queue_dir";
my $working_dir="working_dir";
my $repository_dir="repository_dir";
my $failure_dir="failure_dir";
my $success_dir="success_dir";
my $tmp_dir="tmp_dir";

@dir_list = (${queue_dir},${working_dir},${repository_dir},${failure_dir},${success_dir},${tmp_dir});

my $current_dir = dirname($0);
chdir ${current_dir};

foreach $dir_name (@dir_list){
	if ( ! -d $dir_name ){
		mkdir $dir_name;
	}
}

my $word = CreateObject Win32::OLE 'Word.Application' or die $!;
$word->{'Visible'} = 0;
$word->{'DisplayAlerts'} = wdAlertsNone;

sub docx2pdf
{
	print "RECEIVED : $_[0] \n";
	my $document = $word->Documents->Open("$_[0]");
	my $docxname = $document->Name;
	my $basename = basename($docxname, qw(.docx .doc));
	my $pdffilename = "${current_dir}\\$repository_dir\\$basename.pdf";
	$document -> {'ReadOnly'} = 0;
	$document -> saveas (
		{
			FileName=>$pdffilename,
			FileFormat=>wdExportFormatPDF
		}
	); 
	$document -> close ({SaveChanges=>wdDoNotSaveChanges});
	print "COMPLETED : $_[0] ...\n";
	sleep 5;
	move "$_[0]","${current_dir}\\${success_dir}\\$docxname";
}

sub scandir
{
	opendir DIR, "$_[0]" or die $!;
	local @filelist = readdir(DIR);
	closedir(DIR);
	foreach $file (@filelist) {
		if ($file !~ /\.$/ and $file !~ /\.\.$/){
			if ($file =~ /^[^~].+\.(doc|docx)$/i){
				return "$file";
			}
		}
	}
}

while ( 'True' ){
	$rs_queue_dir = &scandir( ${queue_dir} );
	if (!$rs_queue_dir){
		print "Sleeping ... \n";	
		sleep 5;
	} else {
		print "\nBegin : $rs_queue_dir ... \n";	
		&docx2pdf("${current_dir}\\${queue_dir}\\$rs_queue_dir");
	}
}
$word -> quit({SaveChanges=>wdDoNotSaveChanges});

以上代码在正常情况下能用, 但是如果 winword.exe 进程被关掉, 那么就不 work 了, 希望有人帮忙改进一下. 本人 perl 新手.

加载中
0
皮总
皮总

上面已经无法编辑了,改了一下, 下面这个目前满意 :

#!/usr/bin/perl

use strict;
use File::Basename;
use File::Copy;
use Archive::Zip qw( :ERROR_CODES );
use XML::Simple;
use POSIX qw(strftime);
use Win32::OLE;
use Win32::OLE::Const 'Microsoft Word';

my $queue_dir = "queue_dir";
my $repository_dir = "repository_dir";
my $failure_dir = "failure_dir";
my $success_dir = "success_dir";
my $tmp_dir = "tmp_dir";
my $current_dir = dirname($0);

chdir ${current_dir};

my (@dir_list, $dir_name);
@dir_list = (${queue_dir},${repository_dir},${failure_dir},${success_dir},${tmp_dir});
foreach $dir_name (@dir_list){
	if ( ! -d $dir_name ){
		mkdir $dir_name;
	}
}

sub clearproperties 
{
	my ($xml, $buffer, $temp_a, $temp_b, $docxbase );
	my $count = 0;
	my $zip = Archive::Zip->new();
	$docxbase = basename($_[0], qw(.docx .doc));
	unless ( $zip->read("${queue_dir}\\$_[0]") == AZ_OK )
	{
		&checkresult("$_[0]","$docxbase.pdf");
		return;
	};
	$buffer = $zip->contents( '_rels/.rels' );
	$xml = XML::Simple->new()->XMLin($buffer, KeepRoot => 1);
	$temp_a = $xml->{Relationships}->{Relationship};
	foreach $temp_b (@$temp_a){
		if ( $temp_b->{Target} =~ /custom.xml$/ ){
			delete $xml->{Relationships}->{Relationship}->[$count];
		}
		$count++;
	}
	$xml = XML::Simple->new()->XMLout($xml,NoIndent => 1, RootName => '',XMLDecl => '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>');
	$zip->contents( '_rels/.rels', "$xml" );
	$zip->overwrite();
	&docx2pdf("$_[0]","$docxbase.pdf");
}

sub docx2pdf
{
	my $complete_time = strftime("%Y-%m-%d %H:%M:%S", localtime(time));
	print "\n${complete_time} : RECEIVED $_[0] ...\n";
	my $docxname = "$_[0]";
	my $pdfname = "$_[1]";
	my $word = CreateObject Win32::OLE 'Word.Application' or die $!;
	$word -> {'Visible'} = 0;
	$word -> {'EnableEvents'} = 0;
	$word -> AddIns -> Unload;
	$word -> {'DisplayAlerts'} = wdAlertsNone;
	my $pdffile = "${current_dir}\\${repository_dir}\\$pdfname";
	my $document = $word->Documents->OpenNoRepairDialog("${current_dir}\\${queue_dir}\\$_[0]");
	$document -> {'ReadOnly'} = 1;
	$document -> ExportAsFixedFormat (
		{
			OutputFileName => $pdffile,
			ExportFormat => wdExportFormatPDF,
			OpenAfterExport => 0,
			OptimizeFor => wdExportOptimizeForPrint
		}
	);
	$document -> close ({SaveChanges=>wdDoNotSaveChanges});
	$word -> quit({SaveChanges=>wdDoNotSaveChanges});
	&checkresult("$docxname","$pdfname");
}

sub checkresult
{
	my $complete_time = strftime("%Y-%m-%d %H:%M:%S", localtime(time));
	if (open(MYFILE,">>convert.log")) {
		if (-e "$repository_dir\\$_[1]") {
			move "${queue_dir}\\$_[0]","${success_dir}\\$_[0]";
			print ("${complete_time} : COMPLETE $_[0] convert successful ...\n\n");
			print MYFILE ("${complete_time} $_[0] convert successful ...\n");
		} else {
			move "${queue_dir}\\$_[0]","${failure_dir}\\$_[0]";
			print ("${complete_time} : COMPLETE $_[0] convert failure ...\n\n");		
			print MYFILE ("${complete_time} $_[0] convert failure ...\n");		
		}
	}
	close(MYFILE);
}

sub clearold
{
	my ($cleandir, $cleanfile);
	my $cleandir = "$_[0]";
	my $timestamp = time();
	opendir CLEAN, $cleandir;
	for $cleanfile (readdir CLEAN)
	{
		($cleanfile eq '.' or $cleanfile eq '..') and next;
		if ($timestamp - (stat("$cleandir\\$cleanfile"))[9] > 86400)
		{
		   unlink "$cleandir\\$cleanfile";
		}
	}
	closedir CLEAN;
}

sub scandir
{
	my (@filelist,$file);
	opendir DIR, "$_[0]" or die $!;
	@filelist = readdir(DIR);
	closedir(DIR);
	foreach $file (@filelist) {
		if ($file !~ /\.$/ and $file !~ /\.\.$/){
			if ($file =~ /^[^~].*\.docx$/i){
				return "$file";
			}
		}
	}
}

while ( 'True' ){
	my $rs_queue_file = &scandir( ${queue_dir} );
	if (!$rs_queue_file){
		print "Sleeping ... \n";	
		sleep 5;
	} else {
		&clearproperties("${rs_queue_file}");
		&clearold("${repository_dir}");
		&clearold("${success_dir}");
	}
}

0
RickyFeng
RickyFeng
用pdfFactory转不是更好更方便吗
皮总
皮总
@RickyFeng : 你的方法不行 , 我上面调用 saveas 的方法也不行, 我的情况很特殊的, 文档是通过模板生成的, 模板是程序来读 webservices 来生成的 , 哎悲剧 ....
RickyFeng
RickyFeng
@皮总 pdffactory是专业做转pdf的,我们公司用了10多年了,你试下先
皮总
皮总
@RickyFeng 我用过 http://www.primopdf.com/ , 但是没法自动, 用脚本可以不退出扫描到文件就开始转换
RickyFeng
RickyFeng
@皮总 你用过没有
皮总
皮总
我这的情况是每天可能有好几百个要转换的 ... 量多
下一页
0
皮总
皮总
恶意顶一下
0
皮总
皮总

上面已经无法编辑了,改了一下 :

#!/usr/bin/perl

use autodie;
use Try::Tiny;
use File::Basename;
use File::Copy;
use Win32::OLE;
use POSIX qw(strftime);
use Win32::OLE::Const 'Microsoft Word';

my $queue_dir="queue_dir";
my $repository_dir="repository_dir";
my $failure_dir="failure_dir";
my $success_dir="success_dir";
my $tmp_dir="tmp_dir";
my $current_dir = dirname($0);

chdir ${current_dir};

@dir_list = (${queue_dir},${repository_dir},${failure_dir},${success_dir},${tmp_dir});
foreach $dir_name (@dir_list){
	if ( ! -d $dir_name ){
		mkdir $dir_name;
	}
}

my $word = CreateObject Win32::OLE 'Word.Application' or die $!;
$word->{'Visible'} = 0;
$word->{'DisplayAlerts'} = wdAlertsNone;

sub docx2pdf
{
	$complete_time = strftime("%Y-%m-%d %H:%M:%S", localtime(time));
	print "\n${complete_time} : RECEIVED $_[0] ...\n";
	my $docxname = "$_[0]";
	my $docxbase = basename($docxname, qw(.docx .doc));
	my $pdffile = "${current_dir}\\${repository_dir}\\$docxbase.pdf";
	$document = $word->Documents->Open("${current_dir}\\${queue_dir}\\$_[0]");
	$document -> {'ReadOnly'} = 0;
	$document -> saveas (
		{
			FileName=>$pdffile,
			FileFormat=>wdExportFormatPDF
		}
	);
	$document -> close ({SaveChanges=>wdDoNotSaveChanges});
	# $word -> quit({SaveChanges=>wdDoNotSaveChanges});
	&checkresult("$docxname","$docxbase.pdf");
}

sub checkresult
{
	$complete_time = strftime("%Y-%m-%d %H:%M:%S", localtime(time));
	if (open(MYFILE,">>convert.log")) {
		if (-e "$repository_dir\\$_[1]") {
			move "${queue_dir}\\$_[0]","${success_dir}\\$_[0]";
			print ("${complete_time} : COMPLETE $_[0] convert successful ...\n\n");
			print MYFILE ("${complete_time} $_[0] convert successful ...\n");
		} else {
			move "${queue_dir}\\$_[0]","${failure_dir}\\$_[0]";
			print ("${complete_time} : COMPLETE $_[0] convert failure ...\n\n");		
			print MYFILE ("${complete_time} $_[0] convert failure ...\n");		
		}
	}
	close(MYFILE);
}

sub scandir
{
	opendir DIR, "$_[0]" or die $!;
	local @filelist = readdir(DIR);
	closedir(DIR);
	foreach $file (@filelist) {
		if ($file !~ /\.$/ and $file !~ /\.\.$/){
			if ($file =~ /^[^~].+\.(doc|docx)$/i){
				return "$file";
			}
		}
	}
}

while ( 'True' ){
	$rs_queue_file = &scandir( ${queue_dir} );
	if (!$rs_queue_file){
		print "Sleeping ... \n";	
		sleep 5;
	} else {
		&docx2pdf("$rs_queue_file");
	}
}

 

有 2 个问题就是:

1. 别人拿个 非 docx 格式的文件改成 .docx 后辍上面就不 work 了

2. 如果 word 进程意外挂了,上面也不 work 了

希望有人帮忙改进一下.

求助 @红薯

返回顶部
顶部