Create CSV from Formdata of PDFs in a folder (using PDFtk)

Shared Libraries
Forum rules
For sharing working examples of macros / scripts. These can be in any script language supported by OpenOffice.org [Basic, Python, Netbean] or as source code files in Java or C# even - but requires the actual source code listing. This section is not for asking questions about writing your own macros.
Post Reply
musikai
Volunteer
Posts: 294
Joined: Wed Nov 11, 2015 12:19 am

Create CSV from Formdata of PDFs in a folder (using PDFtk)

Post by musikai »

With PDFtk installed this will extract all formfield data of all PDFs in a chosen folder and create a CSV.
This is a chained version of these seperate codes:
Extract form data of Folder of PDFs (using PDFtk)
viewtopic.php?f=21&t=91586
Convert Folder of Formdata.txt (created with PDFtk) to .xfdf
viewtopic.php?f=21&t=91587
XFDF to CSV-Converter
viewtopic.php?f=21&t=91524
Calc: Replace "\n" with linebreaks
viewtopic.php?f=21&t=91651

At the End a "merged.csv" is created in the chosen Folder and automatically opened in Calc.

Code: Select all

Sub Folder_of_PDFs_to_CSV

    If (Not GlobalScope.BasicLibraries.isLibraryLoaded("Tools")) Then GlobalScope.BasicLibraries.LoadLibrary("Tools")
    oSFA = createUnoService("com.sun.star.ucb.SimpleFileAccess")
    
    sFolderpath=""
    rem---folderpicker
    rem to not use the folderpicker set the folder here and uncomment:
    rem sFolderpath=converttourl("C:\Users\kai\Desktop")
    if sFolderpath="" then
    oDialog = CreateUnoService("com.sun.star.ui.dialogs.FolderPicker")

    If oDialog.Execute() = 1 Then
    sFolderpath = oDialog.getDirectory
    else
    exit sub
    end if

    end if
    if not oSFA.Exists(sFolderpath) then
    msgbox "Folder not found!"
    exit sub
    end if

	rem-----if file is opened in another application
	if fileexists(sFolderpath & "/" & "merged.csv") then
	On Error goto Fileaccesserror 
	proceed:
	kill(sFolderpath & "/" & "merged.csv")
	end if


	rem oProgressBar = ThisComponent.CurrentController.getStatusIndicator()
	rem oProgressBar.start( "Creating CSV", 4 )
	rem oProgressBar.setValue( 1 )
	PDFTK_dumpdatafields_of_Folder_with_PDFs_to_txt_files(sFolderpath)
	rem oProgressBar.setValue( 2 )
	Convert_Folder_of_pdftk_dumpdatafieldsTXTs_to_XFDFs(sFolderpath)
	rem oProgressBar.setValue( 3 )
    Convert_multipleXFDFs_to_1CSV(sFolderpath)
	rem oProgressBar.setValue( 4 )
    calc_replaceLinebreak_UsedArea

	rem oProgressBar.end

	exit sub
	Fileaccesserror:
	closemessage = """" & convertfromurl(sFolderpath & "/" & "merged.csv") & """ seems to be opened in another application."  & chr(10) & chr(10) & "Please close the file!"
	msgbox(closemessage,0)
	rem resume proceed 
End sub


    Sub PDFTK_dumpdatafields_of_Folder_with_PDFs_to_txt_files(sFolderpath)

	pdftkapp="pdftk"

    sFileName=""
    sFileName = Dir(sFolderpath & "/", 0)
    Do While (sFileName <> "")
    if GetFileNameExtension(sFileName)="pdf" then
    Fileurl=sFolderpath & "/" & sFileName
	inputfile=convertfromurl(Fileurl)

	mkdir(sFolderpath & "/" & "PDFtk-DataFields")
	rem indirect way because of PDFtk utf8 filename bug
	outputfiledummy = sFolderpath & "/" & "PDFtk-DataFields" & "/" & "datafields.txt"
	outputfile = sFolderpath & "/" & "PDFtk-DataFields" & "/" & getfilenamewithoutextension(sFileName) & ".txt"
    if fileexists(outputfiledummy) then kill(outputfiledummy) 
    outputfiledummy=convertfromurl(outputfiledummy) 
    if fileexists(outputfile) then kill(outputfile) 
    Shell(pdftkapp,0, """" & inputfile & """" & " dump_data_fields_utf8 output " & """" & outputfiledummy & """", true)
    rem indirect way because of PDFtk utf8 filename bug
    
		if fileexists(outputfiledummy) then 
		Name outputfiledummy as outputfile
		end if
	end if
    sFileName = Dir()
    loop

    rem msgbox "Files saved to: " & convertfromurl(sFolderpath & "/" & "PDFtk-DataFields")

    End Sub
    
    
    Sub Convert_Folder_of_pdftk_dumpdatafieldsTXTs_to_XFDFs(sFolderpath)
	oSFA = createUnoService("com.sun.star.ucb.SimpleFileAccess")
	
	sFolderpath2 = sFolderpath & "/" & "PDFtk-DataFields"

    sFileName=""
    sFileName = Dir(sFolderpath2 & "/", 0)
    Do While (sFileName <> "")
    if GetFileNameExtension(sFileName)="txt" then
    Fileurl=sFolderpath2 & "/" & sFileName

	oTextinputStream = CreateUnoService("com.sun.star.io.TextInputStream")
	inputfile = converttourl(Fileurl)
	oinputStream = oSFA.openFileRead(inputfile)
	oTextinputStream.setInputStream(oinputStream)
	
	mkdir(sFolderpath2 & "/" & "XFDF")
	outputfile = converttourl(sFolderpath2 & "/" & "XFDF" & "/" & getfilenamewithoutextension(sFileName) & ".xfdf")
    if fileexists(outputfile) then kill(outputfile)   
    
    oTextoutputStream = CreateUnoService("com.sun.star.io.TextOutputStream")
    ooutputStream = oSFA.openFileWrite(outputfile)
    oTextoutputStream.setOutputStream(ooutputStream)
	
	oTextoutputStream.writestring("<?xml version=""1.0"" encoding=""UTF-8""?>" & chr(10) & "<xfdf xmlns=""http://ns.adobe.com/xfdf/"" xml:space=""preserve"">" & chr(10))
	rem oTextoutputStream.writestring("<f href=""" & convertfromurl(inputfile) & """/>" & chr(10))	
	oTextoutputStream.writestring("<fields>" & chr(10))
	
	
	foundFieldName=0
	foundFieldValue=0

	do while oTextinputStream.isEOF()=0
	stringA = oTextinputStream.readLine()
	if instr(stringA,"Field")=0 and instr(stringA,": ")=0 and foundFieldValue=1 then
	FieldValue = FieldValue & chr(10) & stringA
	elseif foundFieldValue=1 then
	oTextoutputStream.writestring("<value>" & FieldValue & "</value>" & chr(10))
	foundFieldValue=0
	end if
	if instr(stringA,"FieldName: ")=1 then 
	FieldName=mid(stringA,len("FieldName: ")+1)
	foundFieldName=1
	oTextoutputStream.writestring("<field name=""" & FieldName & """>" & chr(10))
	end if
	if instr(stringA,"FieldValue: ")=1 then 
	FieldValue=mid(stringA,len("FieldValue: ")+1)
	foundFieldValue=1
	end if
	
	if instr(stringA,"---")=1 and foundFieldName then 
	oTextoutputStream.writestring("</field>" & chr(10))
	foundFieldName=0
	end if
	
	loop
	
	if foundFieldName then oTextoutputStream.writestring("</field>" & chr(10))
	oTextoutputStream.writestring("</fields>" & chr(10) & "</xfdf>" )
       
	oTextinputStream.closeInput
	oTextoutputStream.closeOutput()
    end if
    sFileName = Dir()
    loop
 
    End Sub


    
    Sub Convert_multipleXFDFs_to_1CSV(sFolderpath)
    
    sFolderpathOrg=sFolderpath
    sFolderpath=sFolderpath & "/" & "PDFtk-DataFields" & "/" & "XFDF"

    dim allXFDFHeaders(0)
    dim allXFDFValues()
    sFileName=""
    sFileName = Dir(sFolderpath & "/", 0)
    Do While (sFileName <> "")
    if GetFileNameExtension(sFileName)="xfdf" then
    Fileurl=sFolderpath & "/" & sFileName

       oSFA = createUnoService("com.sun.star.ucb.SimpleFileAccess")
       oInStream = oSFA.openFileRead(Fileurl)
       oDOM = createUnoService("com.sun.star.xml.dom.DocumentBuilder")
       oXML = oDOM.parse(oInStream)
       oInStream.closeInput
       
       nodeListFields = oXML.getElementsByTagName("fields").item(0).getElementsByTagName("field")

       for i=0 to nodeListFields.length-1
          curFieldNode = nodeListFields.item(i)
       If curFieldNode.hasAttributes Then
       nodeAttributes = curFieldNode.Attributes
          For k=0 To nodeAttributes.Length - 1
                additem=1
                 for j=0 to ubound(allXFDFHeaders)
              if allXFDFHeaders(j)=convertXMLstring(nodeAttributes.item(k).Value) then additem=0
               next j
             
            if additem=1 then
            redim preserve allXFDFHeaders(ubound(allXFDFHeaders)+1)
          allXFDFHeaders(ubound(allXFDFHeaders))=convertXMLstring(nodeAttributes.item(k).Value)
          end if

          Next k
       end if
         
       next i
    end if
    sFileName = Dir()
    loop
    k=0
    i=0
    j=0
       

    sFileName = Dir(sFolderpath & "/", 0)
    Do While (sFileName <> "")
    if GetFileNameExtension(sFileName)="xfdf" then
    Fileurl=sFolderpath & "/" & sFileName

       oSFA = createUnoService("com.sun.star.ucb.SimpleFileAccess")
       oInStream = oSFA.openFileRead(Fileurl)
       oDOM = createUnoService("com.sun.star.xml.dom.DocumentBuilder")
       oXML = oDOM.parse(oInStream)
       oInStream.closeInput
       
       nodeListFields = oXML.getElementsByTagName("fields").item(0).getElementsByTagName("field")

       FieldNameResult=""
       FieldValueResult=""
       
       for i=0 to nodeListFields.length-1
          curFieldNode = nodeListFields.item(i)
       If curFieldNode.hasAttributes Then
       nodeAttributes = curFieldNode.Attributes
          For k=0 To nodeAttributes.Length - 1
          FieldNameResult = FieldNameResult & chr(9) & convertXMLstring(nodeAttributes.item(k).Value)
          Next k
       end if
       
       childcounter=1
       sNodeValue=""
       sNodeValuetemp=""
       If curFieldNode.hasChildNodes Then
       curChildNode = curFieldNode.firstChild
       
       Do While NOT isNull(curChildNode)
       
          If curChildNode.LocalName = "value" then
             if curChildNode.hasChildNodes then
             curChildChildNode=curChildNode.firstChild                      
   	         sNodeValuetemp= curChildChildNode.NodeValue
    	     sNodeValuetemp= replace(sNodeValuetemp,chr(10), "\n")
    	     sNodeValuetemp= replace(sNodeValuetemp,";", ",")         	         
             else
             sNodeValuetemp=""
             end if
           
             if childcounter=1 then
    	     sNodeValue=sNodeValuetemp
    	     else
    	     sNodeValue = sNodeValue & "\n" & sNodeValuetemp
    	     end if
    	     
             childcounter=childcounter+1
         
          end if
       curChildNode = curChildNode.nextSibling
       
       loop
       FieldValueResult = FieldValueResult & chr(9) & sNodeValue
       end if
       
       next i
       m=0
       n=0
        XFDFHeaders = split(FieldNameResult,chr(9))
        XFDFValues = split(FieldValueResult,chr(9))

        redim allXFDFValues(ubound(allXFDFHeaders))
        for m=0 to ubound(XFDFHeaders)
         for n=0 to ubound(allXFDFHeaders)
         if XFDFHeaders(m)=allXFDFHeaders(n) then allXFDFValues(n)=XFDFValues(m)
         next n
        next m
       FieldValueResult2 = FieldValueResult2 & getfilenamewithoutextension(sFileName) & ".pdf" & join(allXFDFValues(),";") & chr(13) & chr(10)

    end if
    sFileName = Dir()
    loop
    if ubound(allXFDFHeaders())=0 then
    msgbox "No .xfdf files found!"
    exit sub
    end if
    Outstring =  "File" & join(allXFDFHeaders(),";") & chr(13) & chr(10) & FieldValueResult2

    outputfile = sFolderpathOrg & "/" & "merged.csv"
    if fileexists(outputfile) then kill(outputfile)   
    
    oTextoutputStream = CreateUnoService("com.sun.star.io.TextOutputStream")
    ooutputStream = oSFA.openFileWrite(outputfile)
    oTextoutputStream.setOutputStream(ooutputStream)
    oTextoutputStream.writestring(Outstring)
    oTextoutputStream.closeOutput()
    rem msgbox "Saved: " & convertfromurl(outputfile)
    OpenCSVinCalc(outputfile)
    End Sub

    rem-----for opening in Calc
    Sub OpenCSVinCalc(Url)
    Dim oDocument as Object
    fnURL=ConvertToURL(Url)

    'Create new document and import data
    oDocument = StarDesktop.LoadComponentFromURL( fnURL, "_blank", 0, _
    Array(MakePropertyValue( "FilterName", "Text - txt - csv (StarCalc)" ), _
             MakePropertyValue( "FilterOptions", "59,34,76,1,1" )

    End Sub

    Function MakePropertyValue( Optional cName As String, Optional uValue ) As com.sun.star.beans.PropertyValue
       oPropertyValue = createUnoStruct( "com.sun.star.beans.PropertyValue" )
       If Not IsMissing( cName ) Then
          oPropertyValue.Name = cName
       EndIf
       If Not IsMissing( uValue ) Then
          oPropertyValue.Value = uValue
       EndIf
       MakePropertyValue() = oPropertyValue
    End Function
    
    
    function convertXMLstring(instring)
	instring=replace(instring,"#","%")
	convertXMLstring = convertfromurl(instring)
	end function
    
sub calc_replaceLinebreak_UsedArea
Doc = ThisComponent
If Doc.SupportsService("com.sun.star.sheet.SpreadsheetDocument") Then
else 
exit sub
end if
Sheet = Doc.Sheets.getByIndex(0)
oCursor = Sheet.createCursor()
oCursor.gotoEndofUsedArea(false)

for i=0 to oCursor.RangeAddress.EndColumn
j=0
for j=0 to oCursor.RangeAddress.EndRow
Sheet.getCellByPosition(i,j).string = replace(Sheet.getCellByPosition(i,j).string,"\n",chr(10))
'Sheet.getCellByPosition(i,j).string = replace(Sheet.getCellByPosition(i,j).string,chr(10),"\n")
next j
next i
for k=0 to oCursor.RangeAddress.EndRow
oRow = Sheet.getRows().getByIndex(oCursor.RangeAddress.EndRow-k)
oRow.OptimalHeight()=True
next
for m=0 to oCursor.RangeAddress.EndColumn
oColumn = Sheet.getColumns().getByIndex(oCursor.RangeAddress.EndColumn-m)
oColumn.OptimalWidth()=True
next
end sub
Win7 Pro, Lubuntu 15.10, LO 4.4.7, OO 4.1.3
Free Project: LibreOffice Songbook Architect (LOSA)
http://struckkai.blogspot.de/2015/04/li ... itect.html
elmanuelito
Posts: 1
Joined: Fri Feb 15, 2019 9:05 pm

Re: Create CSV from Formdata of PDFs in a folder (using PDFt

Post by elmanuelito »

Thank you for your script. I'm new here, which programming language is this? Should this script be run somehow in OpenOffice? Or is this something standalone.

Thanks a lot!
OpenOffice 4.1.6 on Windows 10
musikai
Volunteer
Posts: 294
Joined: Wed Nov 11, 2015 12:19 am

Re: Create CSV from Formdata of PDFs in a folder (using PDFt

Post by musikai »

This is a BASIC macro. You have to copy this into a Module and run "Folder_of_PDFs_to_CSV".

For managing Libraries and Modules go to "Tools"-"Macros"-"Organise Macros"-"OpenOffice Basic"
E.g. choose " My Macros", "Standard" and copy into Module 1.

Read the book "Macros Explained": http://www.pitonyak.org/oo.php

NOTE: For this macro you must have PDFtk from PDF-Labs installed on your system.
Win7 Pro, Lubuntu 15.10, LO 4.4.7, OO 4.1.3
Free Project: LibreOffice Songbook Architect (LOSA)
http://struckkai.blogspot.de/2015/04/li ... itect.html
Post Reply