Create CSV from Formdata of PDFs in a folder (using PDFtk)

Creating Extension - Shared Libraries
Forum rules
For sharing working examples of macros / scripts. These can be in any script language supported by OpenOffice.org [Basic, Python, Netbean] or as source code files in Java or C# even - but requires the actual source code listing. This forum is not for asking questions about writing your own macros.

Create CSV from Formdata of PDFs in a folder (using PDFtk)

Postby musikai » Thu Dec 14, 2017 1:33 am

With PDFtk installed this will extract all formfield data of all PDFs in a chosen folder and create a CSV.
This is a chained version of these seperate codes:
Extract form data of Folder of PDFs (using PDFtk)
https://forum.openoffice.org/en/forum/viewtopic.php?f=21&t=91586
Convert Folder of Formdata.txt (created with PDFtk) to .xfdf
https://forum.openoffice.org/en/forum/viewtopic.php?f=21&t=91587
XFDF to CSV-Converter
https://forum.openoffice.org/en/forum/viewtopic.php?f=21&t=91524
Calc: Replace "\n" with linebreaks
https://forum.openoffice.org/en/forum/viewtopic.php?f=21&t=91651

At the End a "merged.csv" is created in the chosen Folder and automatically opened in Calc.

Code: Select all   Expand viewCollapse view
Sub Folder_of_PDFs_to_CSV

    If (Not GlobalScope.BasicLibraries.isLibraryLoaded("Tools")) Then GlobalScope.BasicLibraries.LoadLibrary("Tools")
    oSFA = createUnoService("com.sun.star.ucb.SimpleFileAccess")
   
    sFolderpath=""
    rem---folderpicker
    rem to not use the folderpicker set the folder here and uncomment:
    rem sFolderpath=converttourl("C:\Users\kai\Desktop")
    if sFolderpath="" then
    oDialog = CreateUnoService("com.sun.star.ui.dialogs.FolderPicker")

    If oDialog.Execute() = 1 Then
    sFolderpath = oDialog.getDirectory
    else
    exit sub
    end if

    end if
    if not oSFA.Exists(sFolderpath) then
    msgbox "Folder not found!"
    exit sub
    end if

   rem-----if file is opened in another application
   if fileexists(sFolderpath & "/" & "merged.csv") then
   On Error goto Fileaccesserror
   proceed:
   kill(sFolderpath & "/" & "merged.csv")
   end if


   rem oProgressBar = ThisComponent.CurrentController.getStatusIndicator()
   rem oProgressBar.start( "Creating CSV", 4 )
   rem oProgressBar.setValue( 1 )
   PDFTK_dumpdatafields_of_Folder_with_PDFs_to_txt_files(sFolderpath)
   rem oProgressBar.setValue( 2 )
   Convert_Folder_of_pdftk_dumpdatafieldsTXTs_to_XFDFs(sFolderpath)
   rem oProgressBar.setValue( 3 )
    Convert_multipleXFDFs_to_1CSV(sFolderpath)
   rem oProgressBar.setValue( 4 )
    calc_replaceLinebreak_UsedArea

   rem oProgressBar.end

   exit sub
   Fileaccesserror:
   closemessage = """" & convertfromurl(sFolderpath & "/" & "merged.csv") & """ seems to be opened in another application."  & chr(10) & chr(10) & "Please close the file!"
   msgbox(closemessage,0)
   rem resume proceed
End sub


    Sub PDFTK_dumpdatafields_of_Folder_with_PDFs_to_txt_files(sFolderpath)

   pdftkapp="pdftk"

    sFileName=""
    sFileName = Dir(sFolderpath & "/", 0)
    Do While (sFileName <> "")
    if GetFileNameExtension(sFileName)="pdf" then
    Fileurl=sFolderpath & "/" & sFileName
   inputfile=convertfromurl(Fileurl)

   mkdir(sFolderpath & "/" & "PDFtk-DataFields")
   rem indirect way because of PDFtk utf8 filename bug
   outputfiledummy = sFolderpath & "/" & "PDFtk-DataFields" & "/" & "datafields.txt"
   outputfile = sFolderpath & "/" & "PDFtk-DataFields" & "/" & getfilenamewithoutextension(sFileName) & ".txt"
    if fileexists(outputfiledummy) then kill(outputfiledummy)
    outputfiledummy=convertfromurl(outputfiledummy)
    if fileexists(outputfile) then kill(outputfile)
    Shell(pdftkapp,0, """" & inputfile & """" & " dump_data_fields_utf8 output " & """" & outputfiledummy & """", true)
    rem indirect way because of PDFtk utf8 filename bug
   
      if fileexists(outputfiledummy) then
      Name outputfiledummy as outputfile
      end if
   end if
    sFileName = Dir()
    loop

    rem msgbox "Files saved to: " & convertfromurl(sFolderpath & "/" & "PDFtk-DataFields")

    End Sub
   
   
    Sub Convert_Folder_of_pdftk_dumpdatafieldsTXTs_to_XFDFs(sFolderpath)
   oSFA = createUnoService("com.sun.star.ucb.SimpleFileAccess")
   
   sFolderpath2 = sFolderpath & "/" & "PDFtk-DataFields"

    sFileName=""
    sFileName = Dir(sFolderpath2 & "/", 0)
    Do While (sFileName <> "")
    if GetFileNameExtension(sFileName)="txt" then
    Fileurl=sFolderpath2 & "/" & sFileName

   oTextinputStream = CreateUnoService("com.sun.star.io.TextInputStream")
   inputfile = converttourl(Fileurl)
   oinputStream = oSFA.openFileRead(inputfile)
   oTextinputStream.setInputStream(oinputStream)
   
   mkdir(sFolderpath2 & "/" & "XFDF")
   outputfile = converttourl(sFolderpath2 & "/" & "XFDF" & "/" & getfilenamewithoutextension(sFileName) & ".xfdf")
    if fileexists(outputfile) then kill(outputfile)   
   
    oTextoutputStream = CreateUnoService("com.sun.star.io.TextOutputStream")
    ooutputStream = oSFA.openFileWrite(outputfile)
    oTextoutputStream.setOutputStream(ooutputStream)
   
   oTextoutputStream.writestring("<?xml version=""1.0"" encoding=""UTF-8""?>" & chr(10) & "<xfdf xmlns=""http://ns.adobe.com/xfdf/"" xml:space=""preserve"">" & chr(10))
   rem oTextoutputStream.writestring("<f href=""" & convertfromurl(inputfile) & """/>" & chr(10))   
   oTextoutputStream.writestring("<fields>" & chr(10))
   
   
   foundFieldName=0
   foundFieldValue=0

   do while oTextinputStream.isEOF()=0
   stringA = oTextinputStream.readLine()
   if instr(stringA,"Field")=0 and instr(stringA,": ")=0 and foundFieldValue=1 then
   FieldValue = FieldValue & chr(10) & stringA
   elseif foundFieldValue=1 then
   oTextoutputStream.writestring("<value>" & FieldValue & "</value>" & chr(10))
   foundFieldValue=0
   end if
   if instr(stringA,"FieldName: ")=1 then
   FieldName=mid(stringA,len("FieldName: ")+1)
   foundFieldName=1
   oTextoutputStream.writestring("<field name=""" & FieldName & """>" & chr(10))
   end if
   if instr(stringA,"FieldValue: ")=1 then
   FieldValue=mid(stringA,len("FieldValue: ")+1)
   foundFieldValue=1
   end if
   
   if instr(stringA,"---")=1 and foundFieldName then
   oTextoutputStream.writestring("</field>" & chr(10))
   foundFieldName=0
   end if
   
   loop
   
   if foundFieldName then oTextoutputStream.writestring("</field>" & chr(10))
   oTextoutputStream.writestring("</fields>" & chr(10) & "</xfdf>" )
       
   oTextinputStream.closeInput
   oTextoutputStream.closeOutput()
    end if
    sFileName = Dir()
    loop

    End Sub


   
    Sub Convert_multipleXFDFs_to_1CSV(sFolderpath)
   
    sFolderpathOrg=sFolderpath
    sFolderpath=sFolderpath & "/" & "PDFtk-DataFields" & "/" & "XFDF"

    dim allXFDFHeaders(0)
    dim allXFDFValues()
    sFileName=""
    sFileName = Dir(sFolderpath & "/", 0)
    Do While (sFileName <> "")
    if GetFileNameExtension(sFileName)="xfdf" then
    Fileurl=sFolderpath & "/" & sFileName

       oSFA = createUnoService("com.sun.star.ucb.SimpleFileAccess")
       oInStream = oSFA.openFileRead(Fileurl)
       oDOM = createUnoService("com.sun.star.xml.dom.DocumentBuilder")
       oXML = oDOM.parse(oInStream)
       oInStream.closeInput
       
       nodeListFields = oXML.getElementsByTagName("fields").item(0).getElementsByTagName("field")

       for i=0 to nodeListFields.length-1
          curFieldNode = nodeListFields.item(i)
       If curFieldNode.hasAttributes Then
       nodeAttributes = curFieldNode.Attributes
          For k=0 To nodeAttributes.Length - 1
                additem=1
                 for j=0 to ubound(allXFDFHeaders)
              if allXFDFHeaders(j)=convertXMLstring(nodeAttributes.item(k).Value) then additem=0
               next j
             
            if additem=1 then
            redim preserve allXFDFHeaders(ubound(allXFDFHeaders)+1)
          allXFDFHeaders(ubound(allXFDFHeaders))=convertXMLstring(nodeAttributes.item(k).Value)
          end if

          Next k
       end if
         
       next i
    end if
    sFileName = Dir()
    loop
    k=0
    i=0
    j=0
       

    sFileName = Dir(sFolderpath & "/", 0)
    Do While (sFileName <> "")
    if GetFileNameExtension(sFileName)="xfdf" then
    Fileurl=sFolderpath & "/" & sFileName

       oSFA = createUnoService("com.sun.star.ucb.SimpleFileAccess")
       oInStream = oSFA.openFileRead(Fileurl)
       oDOM = createUnoService("com.sun.star.xml.dom.DocumentBuilder")
       oXML = oDOM.parse(oInStream)
       oInStream.closeInput
       
       nodeListFields = oXML.getElementsByTagName("fields").item(0).getElementsByTagName("field")

       FieldNameResult=""
       FieldValueResult=""
       
       for i=0 to nodeListFields.length-1
          curFieldNode = nodeListFields.item(i)
       If curFieldNode.hasAttributes Then
       nodeAttributes = curFieldNode.Attributes
          For k=0 To nodeAttributes.Length - 1
          FieldNameResult = FieldNameResult & chr(9) & convertXMLstring(nodeAttributes.item(k).Value)
          Next k
       end if
       
       childcounter=1
       sNodeValue=""
       sNodeValuetemp=""
       If curFieldNode.hasChildNodes Then
       curChildNode = curFieldNode.firstChild
       
       Do While NOT isNull(curChildNode)
       
          If curChildNode.LocalName = "value" then
             if curChildNode.hasChildNodes then
             curChildChildNode=curChildNode.firstChild                     
               sNodeValuetemp= curChildChildNode.NodeValue
            sNodeValuetemp= replace(sNodeValuetemp,chr(10), "\n")
            sNodeValuetemp= replace(sNodeValuetemp,";", ",")                     
             else
             sNodeValuetemp=""
             end if
           
             if childcounter=1 then
            sNodeValue=sNodeValuetemp
            else
            sNodeValue = sNodeValue & "\n" & sNodeValuetemp
            end if
            
             childcounter=childcounter+1
         
          end if
       curChildNode = curChildNode.nextSibling
       
       loop
       FieldValueResult = FieldValueResult & chr(9) & sNodeValue
       end if
       
       next i
       m=0
       n=0
        XFDFHeaders = split(FieldNameResult,chr(9))
        XFDFValues = split(FieldValueResult,chr(9))

        redim allXFDFValues(ubound(allXFDFHeaders))
        for m=0 to ubound(XFDFHeaders)
         for n=0 to ubound(allXFDFHeaders)
         if XFDFHeaders(m)=allXFDFHeaders(n) then allXFDFValues(n)=XFDFValues(m)
         next n
        next m
       FieldValueResult2 = FieldValueResult2 & getfilenamewithoutextension(sFileName) & ".pdf" & join(allXFDFValues(),";") & chr(13) & chr(10)

    end if
    sFileName = Dir()
    loop
    if ubound(allXFDFHeaders())=0 then
    msgbox "No .xfdf files found!"
    exit sub
    end if
    Outstring =  "File" & join(allXFDFHeaders(),";") & chr(13) & chr(10) & FieldValueResult2

    outputfile = sFolderpathOrg & "/" & "merged.csv"
    if fileexists(outputfile) then kill(outputfile)   
   
    oTextoutputStream = CreateUnoService("com.sun.star.io.TextOutputStream")
    ooutputStream = oSFA.openFileWrite(outputfile)
    oTextoutputStream.setOutputStream(ooutputStream)
    oTextoutputStream.writestring(Outstring)
    oTextoutputStream.closeOutput()
    rem msgbox "Saved: " & convertfromurl(outputfile)
    OpenCSVinCalc(outputfile)
    End Sub

    rem-----for opening in Calc
    Sub OpenCSVinCalc(Url)
    Dim oDocument as Object
    fnURL=ConvertToURL(Url)

    'Create new document and import data
    oDocument = StarDesktop.LoadComponentFromURL( fnURL, "_blank", 0, _
    Array(MakePropertyValue( "FilterName", "Text - txt - csv (StarCalc)" ), _
             MakePropertyValue( "FilterOptions", "59,34,76,1,1" )

    End Sub

    Function MakePropertyValue( Optional cName As String, Optional uValue ) As com.sun.star.beans.PropertyValue
       oPropertyValue = createUnoStruct( "com.sun.star.beans.PropertyValue" )
       If Not IsMissing( cName ) Then
          oPropertyValue.Name = cName
       EndIf
       If Not IsMissing( uValue ) Then
          oPropertyValue.Value = uValue
       EndIf
       MakePropertyValue() = oPropertyValue
    End Function
   
   
    function convertXMLstring(instring)
   instring=replace(instring,"#","%")
   convertXMLstring = convertfromurl(instring)
   end function
   
sub calc_replaceLinebreak_UsedArea
Doc = ThisComponent
If Doc.SupportsService("com.sun.star.sheet.SpreadsheetDocument") Then
else
exit sub
end if
Sheet = Doc.Sheets.getByIndex(0)
oCursor = Sheet.createCursor()
oCursor.gotoEndofUsedArea(false)

for i=0 to oCursor.RangeAddress.EndColumn
j=0
for j=0 to oCursor.RangeAddress.EndRow
Sheet.getCellByPosition(i,j).string = replace(Sheet.getCellByPosition(i,j).string,"\n",chr(10))
'Sheet.getCellByPosition(i,j).string = replace(Sheet.getCellByPosition(i,j).string,chr(10),"\n")
next j
next i
for k=0 to oCursor.RangeAddress.EndRow
oRow = Sheet.getRows().getByIndex(oCursor.RangeAddress.EndRow-k)
oRow.OptimalHeight()=True
next
for m=0 to oCursor.RangeAddress.EndColumn
oColumn = Sheet.getColumns().getByIndex(oCursor.RangeAddress.EndColumn-m)
oColumn.OptimalWidth()=True
next
end sub
Win7 Pro, Lubuntu 15.10, LO 4.4.7, OO 4.1.3
Free Project: LibreOffice Songbook Architect (LOSA)
http://struckkai.blogspot.de/2015/04/libreofficesongbookarchitect.html
musikai
Volunteer
 
Posts: 204
Joined: Wed Nov 11, 2015 12:19 am

Return to Code Snippets

Who is online

Users browsing this forum: No registered users and 1 guest