/*
//
// ADOBE SYSTEMS INCORPORATED
// Copyright (C) 2000-2003 Adobe Systems Incorporated
// All rights reserved.
//
// NOTICE: Adobe permits you to use, modify, and distribute this file
// in accordance with the terms of the Adobe license agreement
// accompanying it. If you have received this file from a source other
// than Adobe, then your use, modification, or distribution of it
// requires the prior written permission of Adobe.
//
*/
#include "TextExtractWorker.h"
#include "WatchFolder.h"
#include "stdio.h"
#ifdef UNIX_ENV
#include "string.h"
#endif
// Derives the device-independent path of the text file that receives the
// words extracted from sourcePath.
//
// The result is "<file name>.txt" placed one directory above the directory
// that contains the source file, e.g. "/a/watch/doc.pdf" -> "/a/doc.pdf.txt".
// Paths with fewer separators degrade gracefully instead of throwing:
//   "watch/doc.pdf" -> "watch/doc.pdf.txt"
//   "/doc.pdf"      -> "/doc.pdf.txt"
//   "doc.pdf"       -> "doc.pdf.txt"
static std::string BuildOutputPath(const std::string &sourcePath)
{
    const std::string::size_type nameSep = sourcePath.rfind('/');
    if (nameSep == std::string::npos) {
        // Bare file name: there is no directory component to strip.
        return sourcePath + ".txt";
    }

    // The file name keeps its leading '/' so it can be appended directly.
    const std::string fileName = sourcePath.substr(nameSep) + ".txt";

    // Drop the last directory component, if there is one.
    std::string parentDir = sourcePath.substr(0, nameSep);
    const std::string::size_type dirSep = parentDir.rfind('/');
    if (dirSep != std::string::npos) {
        parentDir.erase(dirSep);
    }
    return parentDir + fileName;
}

// Worker thread entry point.
//
// Repeatedly asks the WatchFolder held in pArgs for the next file, opens it as
// a PDF document and writes its words to a text file (see BuildOutputPath for
// the destination).  The loop ends when the WatchFolder hands back NULL, or
// when the output file cannot be created.
//
// pArgs   - thread arguments; watchFolder supplies the work items and tName
//           is used to label progress messages.
// returns - always 0.
ThreadFuncReturnType GetWords(ThreadArgs *pArgs)
{
    INIT_AUTO_POOL(autoReleasePool); /* Required only on MAC platform */
    WatchFolder *theWF = pArgs->watchFolder;

    while (1) {
        // We initialise inside the loop to remove the risk of internal library
        // leaks; this has the effect of deleting all the thread specific PDFL data.
        MyPDFLInit();

        // First thing we do is get the path from the WatchFolder.
        char *fileToExtract = (char *) theWF->getFile();

        // We terminate the main loop if the WatchFolder abstraction gives us a
        // NULL token.  Leaving via break (not return) lets the auto-release
        // pool below be released.
        if (fileToExtract == NULL) {
            MyPDFLTerm();
            break;
        }

        // We have a valid file to extract the text from.  The destination is
        // filename.txt in the parent directory.
        const std::string outputPath = BuildOutputPath(fileToExtract);
        const std::string::size_type pathBytes = outputPath.length() + 1;
        char *pathnm = static_cast<char *>(ASmalloc(pathBytes));
        if (pathnm == NULL) {
            fprintf(stderr, "Cannot allocate the output path for %s\n", fileToExtract);
            MyPDFLTerm();
            break;
        }
        sprintf_safe(pathnm, pathBytes, "%s", outputPath.c_str());

        // Create the output file.  openedFile stays NULL unless the open call
        // reports success, so the check below never reads an unset handle.
        ASFile openedFile = NULL;
        ASPathName outPath = ASFileSysCreatePathFromDIPath(NULL, pathnm, NULL);
        if (outPath != NULL) {
            if (ASFileSysOpenFile(NULL, outPath, ASFILE_WRITE | ASFILE_CREATE, &openedFile) != 0) {
                openedFile = NULL;
            }
            ASFileSysReleasePath(NULL, outPath);
            outPath = NULL;
        }
        if (openedFile == NULL) {
            fprintf(stderr, "Cannot open %s for writing\n", pathnm);
            ASfree(pathnm);
            pathnm = NULL;
            MyPDFLTerm();
            break;
        }
        ASfree(pathnm);
        pathnm = NULL;

        // Everything that is modified inside DURING and inspected in HANDLER
        // is volatile so the handler sees the latest value after a raise.
        volatile ASFile outFile = openedFile;
        volatile PDDoc docP = NULL;          // document we will be extracting text from
        volatile ASPathName filePath = NULL;

        DURING
            // Need a local, writable copy of our filename to get around constness.
            const size_t nameBytes = strlen(fileToExtract) + 1;
            char *newFilename = static_cast<char *>(ASmalloc(nameBytes));
            if (newFilename == NULL) {
                ASRaise(ASRegisterErrorString(ErrAlways, "Cannot allocate file name copy"));
            }
            strcpy_safe(newFilename, nameBytes, fileToExtract);

            DURING
                filePath = ASFileSysPathFromDIPath(NULL, newFilename, NULL);
            HANDLER
                ASfree(newFilename);
                newFilename = NULL;
                ASRaise(ASRegisterErrorString(ErrAlways, "Cannot get DI path"));
            END_HANDLER;
            ASfree(newFilename);
            newFilename = NULL;

            // Open the PDF file.  If it cannot be opened, raise.
            docP = PDDocOpen(filePath, NULL, NULL, true);
            if (docP == NULL) {
                char buffer[400];
                sprintf_safe(buffer, sizeof(buffer), "%s cannot open %s", pArgs->tName, fileToExtract);
                ASRaise(ASRegisterErrorString(ErrAlways, buffer));
            }
            ASFileSysReleasePath(NULL, filePath);
            filePath = NULL;

            // Extract text after opening the PDF.  The messages are printed
            // through a fixed format so a '%' in a file or thread name is
            // never interpreted as a conversion specifier.
            printf("%s opened %s\n", pArgs->tName, fileToExtract);
            ExtractDocText(docP, outFile);
            printf("%s has completed extraction.\n", pArgs->tName);

            // Finish up.
            ASFileClose(outFile);
            outFile = NULL;

            // Close the PDF file.
            PDDocClose(docP);
            docP = NULL;
        HANDLER
            char buf[200];
            ASGetErrorString(ERRORCODE, buf, sizeof(buf));
            fprintf(stderr, "%s", buf);

            // Release whatever was still held when the error was raised.
            if (filePath != NULL) {
                ASFileSysReleasePath(NULL, filePath);
                filePath = NULL;
            }
            if (outFile != NULL) {
                ASFileClose(outFile);
                outFile = NULL;
            }
            if (docP != NULL) {
                PDDocClose(docP);
                docP = NULL;
            }
        END_HANDLER

        MyPDFLTerm();
    }

    RELEASE_AUTO_POOL(autoReleasePool); /* Required only on MAC platform */
    return 0;
}
// Extracts the words of every page of pdDoc into outFile.
//
// Builds the WordFinder configuration record and forwards the whole page
// range (0 .. page count - 1) to ExtractText.  Does nothing when pdDoc is
// NULL.  A failed extraction is reported on stderr.
//
// pdDoc   - document to read; may be NULL.
// outFile - open file that receives the extracted words.
void ExtractDocText(PDDoc pdDoc, ASFile outFile)
{
    if (pdDoc == NULL) {
        return;
    }

    // Set up the WordFinder creation options record.  It lives on the stack:
    // there is no allocation that could fail, and nothing is leaked if a
    // library call below raises.
    PDWordFinderConfigRec wfConfig;
    memset(&wfConfig, 0, sizeof(wfConfig));
    wfConfig.recSize = sizeof(PDWordFinderConfigRec);

    wfConfig.ignoreCharGaps = true;
    wfConfig.ignoreLineGaps = false;
    wfConfig.noAnnots = true;
    wfConfig.noEncodingGuess = true;    // leave non-Roman single-byte font alone

    // Std Roman treatment for custom encoding; overrides the noEncodingGuess option
    wfConfig.unknownToStdEnc = false;

    wfConfig.disableTaggedPDF = false;  // legacy mode WordFinder creation
    wfConfig.noXYSort = false;
    wfConfig.preserveSpaces = false;
    wfConfig.noLigatureExp = false;
    wfConfig.noHyphenDetection = false;
    wfConfig.trustNBSpace = false;
    wfConfig.noExtCharOffset = false;   // text extraction efficiency
    wfConfig.noStyleInfo = false;       // text extraction efficiency
    wfConfig.decomposeTbl = NULL;       // Unicode character replacement
    wfConfig.decomposeTblSize = 0;
    wfConfig.charTypeTbl = NULL;        // Custom char type table
    wfConfig.charTypeTblSize = 0;

    const ASInt32 lastPage = PDDocGetNumPages(pdDoc) - 1;
    if (!ExtractText(pdDoc, 0, lastPage, false, &wfConfig, outFile)) {
        fprintf(stderr, "Text extraction aborted.");
    }
}
// Word enumeration callback: appends one word followed by a single space to
// the output file.
//
// wfObj      - word finder doing the enumeration (unused).
// pdWord     - word to write.
// pgNum      - page number of the word (unused).
// clientData - pointer to the ASFile to write to.
// returns    - true to continue the enumeration, false when a write failed.
ACCB1 ASBool ACCB2 WordEnumProc(PDWordFinder wfObj, PDWord pdWord, ASInt32 pgNum, void *clientData)
{
    ASFile *pAsFile = (ASFile *) clientData;

    // Get the word string.  The buffer is zero-filled and the library is only
    // offered sizeof(str) - 1 bytes, so the text is always NUL-terminated even
    // when the word is long enough to fill everything it was given; the
    // strlen() below can therefore never run past the array.
    char str[128];
    memset(str, 0, sizeof(str));
    PDWordGetString(pdWord, str, sizeof(str) - 1);

    // Write the string to the file.
    const size_t wordLen = strlen(str);
    if (ASFileWrite(*pAsFile, str, wordLen) != wordLen) {
        printf("Failed to write data to file");
        return false;
    }

    // Add a space after the string.
    char separator[2] = " ";
    const size_t separatorLen = strlen(separator);
    if (ASFileWrite(*pAsFile, separator, separatorLen) != separatorLen) {
        printf("Failed to write data to file");
        return false;
    }
    return true;
}
// Writes the words of pages startPg..endPg (inclusive) of pdDoc to outFile.
//
// Each page is preceded by an "Extracting Page N" banner; the words themselves
// are written by WordEnumProc.  Extraction stops at the first failed write.
//
// pdDoc     - document to read.
// startPg   - first page to extract.
// endPg     - last page to extract; nothing is written if endPg < startPg.
// toUnicode - forwarded to PDDocCreateWordFinderEx.
// pConfig   - WordFinder configuration record.
// outFile   - open file that receives the text.
// returns   - true when every page was written; false when a write failed or
//             the library raised an error (the error is printed to stderr).
bool ExtractText(PDDoc pdDoc, ASInt32 startPg, ASInt32 endPg, ASBool toUnicode,
                 PDWordFinderConfig pConfig, ASFile outFile)
{
    bool val = true;

    // Modified inside DURING and inspected in HANDLER, hence volatile.
    volatile PDWordFinder pdWordFinder = NULL;

    DURING
        pdWordFinder = PDDocCreateWordFinderEx(pdDoc, WF_LATEST_VERSION, toUnicode, pConfig);

        // The callback is the same for every page, so create it once.
        PDWordProc wordProc = ASCallbackCreateProto(PDWordProc, &WordEnumProc);

        for (int i = startPg; i <= endPg; i++) {
            char buffer[150];
            sprintf_safe(buffer, sizeof(buffer), "\r\nExtracting Page %d\r\n----------------\r\n", i);
            const size_t bannerLen = strlen(buffer);
            if (ASFileWrite(outFile, buffer, bannerLen) != bannerLen) {
                printf("Failed to write data to file");
                // Leave through the common exit below so the word finder is
                // still destroyed.
                val = false;
                break;
            }

            // The enumeration reports false when WordEnumProc stopped it,
            // i.e. when writing a word failed (already reported by the callback).
            if (!PDWordFinderEnumWords(pdWordFinder, i, wordProc, &outFile)) {
                val = false;
                break;
            }
        }

        PDWordFinderDestroy(pdWordFinder);
        pdWordFinder = NULL;   // keeps the handler from destroying it twice
    HANDLER
        char buf[256], errmsg[256];
        sprintf_safe(buf, sizeof(buf), "[ExtractText()]Error %d: %s",
                     ErrGetCode(ERRORCODE),
                     ASGetErrorString(ERRORCODE, errmsg, sizeof(errmsg)));
        // Print through a fixed format: the error text is not a format string.
        fprintf(stderr, "%s", buf);
        if (pdWordFinder != NULL) {
            PDWordFinderDestroy(pdWordFinder);
            pdWordFinder = NULL;
        }
        val = false;
    END_HANDLER

    return val;
}