Scrape text not in PDF text line field

PDF-XChange Editor SDK for Developers

Moderators: PDF-XChange Support, Daniel - PDF-XChange, Chris - PDF-XChange, Sean - PDF-XChange, Paul - PDF-XChange, Vasyl - PDF-XChange, Ivan - Tracker Software, Stefan - PDF-XChange

Forum rules
DO NOT post your license/serial key, or your activation code - these forums, and all posts within, are public and we will be forced to immediately deactivate your license.

When experiencing some errors, use the IAUX_Inst::FormatHRESULT method to see their description and include it in your post along with the error code.
lidds
User
Posts: 534
Joined: Sat May 16, 2009 1:55 pm

Scrape text not in PDF text line field

Post by lidds »

I am using the following code to gather text from a PDF.

However, this only works if the region area specified by the user encompasses a PDF text field. Is it possible to amend the code to get the text characters within the region, regardless of the text field area?

Code: Select all

''' <summary>
''' Walks the text lines of the selected page(s) and records, per line, the run of characters
''' that qualifies (every character when <paramref name="useArea"/> is False, otherwise only
''' characters whose rectangle lies completely inside the given area).
''' </summary>
''' <param name="areaLeft">Left edge of the area; swapped with <paramref name="areaRight"/> if larger.</param>
''' <param name="areaTop">Top edge of the area; swapped with <paramref name="areaBottom"/> if smaller.</param>
''' <param name="areaRight">Right edge of the area.</param>
''' <param name="areaBottom">Bottom edge of the area.</param>
''' <param name="useArea">True to filter characters by the area and to build <c>returnString</c>.</param>
''' <param name="runRegExTest">True to keep only lines whose text matches <paramref name="regExStr"/>.</param>
''' <param name="regExStr">Pattern used when <paramref name="runRegExTest"/> is True (case-insensitive, single-line).</param>
''' <param name="PDFDoc">Document to read; when Nothing no page is visited and empty tables are returned.</param>
''' <param name="m_pxcInst">Core instance used to create the page-text options.</param>
''' <param name="m_auxInst">Auxiliary instance passed to <c>convertErr</c> when an exception is reported.</param>
''' <param name="pageNumber">Page index to read. 0 is treated as "no specific page" (every page is a candidate).</param>
''' <param name="scrapAllPages">When False the page loop stops after page index 0.</param>
''' <returns>
''' An <c>InfoParams</c> with <c>success</c> = True, the character and line tables, and (when at least
''' one visited page has text lines) <c>returnString</c>. If an exception is thrown, <c>success</c> is
''' False and the error fields are filled in instead.
''' </returns>
Public Function GetTextFromSquare(ByVal areaLeft As Double, ByVal areaTop As Double, ByVal areaRight As Double, ByVal areaBottom As Double, ByVal useArea As Boolean, ByVal runRegExTest As Boolean, ByVal regExStr As String, ByVal PDFDoc As IPXV_Document, ByVal m_pxcInst As IPXC_Inst, ByVal m_auxInst As IAUX_Inst, ByVal pageNumber As Integer, ByVal scrapAllPages As Boolean) As InfoParams
    Dim params = New InfoParams
    params.success = True

    ' Normalise the area so that top >= bottom and left <= right.
    If areaTop < areaBottom Then
        Dim swapValue As Double = areaTop
        areaTop = areaBottom
        areaBottom = swapValue
    End If

    If areaLeft > areaRight Then
        Dim swapValue As Double = areaLeft
        areaLeft = areaRight
        areaRight = swapValue
    End If

    ' One row per character that passed the area test (only filled when useArea is True).
    ' Note: the xmin/xmax columns receive the rectangle's top/bottom and ymin/ymax its left/right.
    params.myFoundCharsDT = New DataTable
    params.myFoundCharsDT.Columns.Add("index", GetType(Integer))
    params.myFoundCharsDT.Columns.Add("xmin", GetType(Double))
    params.myFoundCharsDT.Columns.Add("xmax", GetType(Double))
    params.myFoundCharsDT.Columns.Add("ymin", GetType(Double))
    params.myFoundCharsDT.Columns.Add("ymax", GetType(Double))
    params.myFoundCharsDT.Columns.Add("char", GetType(String))

    ' One row per text line that produced a match.
    params.myFoundLinesDT = New DataTable
    params.myFoundLinesDT.Columns.Add("index", GetType(Integer))
    params.myFoundLinesDT.Columns.Add("midX", GetType(Double))
    params.myFoundLinesDT.Columns.Add("midY", GetType(Double))
    params.myFoundLinesDT.Columns.Add("horizontal", GetType(Boolean))
    params.myFoundLinesDT.Columns.Add("text", GetType(String))
    params.myFoundLinesDT.Columns.Add("boundLeft", GetType(Double))
    params.myFoundLinesDT.Columns.Add("boundRight", GetType(Double))
    params.myFoundLinesDT.Columns.Add("boundTop", GetType(Double))
    params.myFoundLinesDT.Columns.Add("boundBottom", GetType(Double))

    Try
        ' Built on first use so the pattern is compiled once instead of once per line.
        Dim lineFilter As Regex = Nothing

        If PDFDoc IsNot Nothing Then
            For pageNum As UInteger = 0 To PDFDoc.CoreDoc.Pages.Count - 1
                ' pageNumber = 0 means "no specific page": every page is processed.
                If pageNum = pageNumber OrElse pageNumber = 0 Then
                    Dim page As IPXC_Page = PDFDoc.CoreDoc.Pages(pageNum)
                    Dim getTextOptions As IPXC_GetPageTextOptions = m_pxcInst.CreateGetPageTextOptions(1)
                    Dim pageText As IPXC_PageText = page.GetText(getTextOptions, False)

                    If pageText.LinesCount <> 0 Then
                        For lineCount As UInteger = 0 To CUInt(pageText.LinesCount - 1)
                            Dim firstChar As UInteger = pageText.LineInfo(lineCount).nFirstCharIndex
                            Dim charCount As UInteger = pageText.LineInfo(lineCount).nCharsCount
                            Dim lineEnd As UInteger = firstChar + charCount

                            ' An explicit flag is required here: character index 0 is a valid
                            ' first match, so the index itself cannot double as "no match yet".
                            Dim hasMatch As Boolean = False
                            Dim matchFirstChar As Integer = 0
                            Dim matchCount As Integer = 0
                            Dim midX As Double = 0
                            Dim midY As Double = 0
                            Dim charMin As PXC_Rect = Nothing
                            Dim charMax As PXC_Rect = Nothing

                            Dim charLoop As UInteger = firstChar
                            Do While charLoop < lineEnd
                                Dim vChar As PXC_Rect = pageText.CharRect(charLoop)

                                ' Without an area every character qualifies; with one, the
                                ' character rectangle must lie completely inside it.
                                Dim qualifies As Boolean = True
                                If useArea Then
                                    qualifies = (vChar.top <= areaTop) AndAlso (vChar.bottom >= areaBottom) AndAlso (vChar.left >= areaLeft) AndAlso (vChar.right <= areaRight)
                                End If

                                If qualifies Then
                                    If Not hasMatch Then
                                        hasMatch = True
                                        matchFirstChar = CInt(charLoop)
                                        ' NOTE(review): midX is taken from the vertical extent and midY (below)
                                        ' from the horizontal extent - kept as in the original; confirm intent.
                                        midX = vChar.bottom + ((vChar.top - vChar.bottom) / 2)
                                        charMin = vChar
                                    End If

                                    matchCount = matchCount + 1
                                    charMax = vChar

                                    If useArea Then
                                        params.myFoundCharsDT.Rows.Add(lineCount, vChar.top, vChar.bottom, vChar.left, vChar.right, Convert.ToChar(pageText.Char(charLoop)))
                                    End If
                                End If

                                charLoop = charLoop + 1UI
                            Loop

                            If hasMatch Then
                                ' Collapse runs of spaces and trim the extracted text.
                                Dim pdfWord As String = Regex.Replace(pageText.GetChars(matchFirstChar, matchCount), " {2,}", " ")
                                ' NOTE(review): Chr(34) and """" are both a single double-quote, so this
                                ' Replace leaves the text unchanged; kept from the original.
                                pdfWord = pdfWord.Replace(Chr(34), """").Trim
                                midY = charMin.left + ((charMin.right - charMin.left) / 2)

                                ' NOTE(review): flagged "horizontal" when first and last matched characters
                                ' have different bottoms - kept as in the original; confirm intent.
                                Dim isHorizontal As Boolean = Not (charMin.bottom = charMax.bottom)

                                Dim keepLine As Boolean = True
                                If runRegExTest Then
                                    If lineFilter Is Nothing Then
                                        regExStr = regExStr.Replace(Chr(34), """")
                                        lineFilter = New Regex(regExStr, RegexOptions.IgnoreCase Or RegexOptions.Singleline)
                                    End If
                                    keepLine = lineFilter.IsMatch(pdfWord)
                                End If

                                If keepLine Then
                                    params.myFoundLinesDT.Rows.Add(lineCount, midX, midY, isHorizontal, pdfWord, charMin.left, charMax.right, charMin.top, charMax.bottom)
                                End If
                            End If
                        Next

                        ' Rebuild the combined string from every line collected so far, in sorted order.
                        params.myFoundLinesDT.DefaultView.Sort = "midY DESC, midX DESC"
                        params.returnString = ""
                        If useArea Then
                            For Each myRow As DataRowView In params.myFoundLinesDT.DefaultView
                                params.returnString = params.returnString & " " & myRow("text").ToString
                            Next
                        End If
                    End If

                    ' A specific page was requested and has now been handled.
                    If pageNumber <> 0 Then
                        Exit For
                    End If
                End If

                ' Only scrap single page, otherwise the document and rev numbers combine from all pages
                ' NOTE(review): with scrapAllPages = False the loop stops after page index 0 even when a
                ' later pageNumber was requested - kept as in the original; confirm against callers.
                If scrapAllPages = False AndAlso pageNum = 0 Then
                    Exit For
                End If
            Next
        End If

        Return params
    Catch ex As Exception
        params.success = False
        params.errorCode = "37,001"
        params.errorException = ex.Message
        params.errorMessage = "Error getting text from area on document"
        params.errorPDFException = convertErr(ex, m_auxInst).ToString

        Return params
    End Try
End Function


An example is shown below:
image.png
Thanks

Simon
You do not have the required permissions to view the files attached to this post.
lidds
User
Posts: 534
Joined: Sat May 16, 2009 1:55 pm

Re: Scrape text not in PDF text line field

Post by lidds »

I was just wondering if anyone has an idea on how to do this?

Thanks

Simon
User avatar
Vasyl - PDF-XChange
Site Admin
Posts: 2476
Joined: Thu Jun 30, 2005 4:11 pm

Re: Scrape text not in PDF text line field

Post by Vasyl - PDF-XChange »

Please try to set one parameter in your code:

getTextOptions.TableDetectMode = TDM_ByLine;

It may try to split the text-lines according to the table structures.

HTH.
PDF-XChange Co. LTD (Project Developer)

Please archive any files posted to a ZIP, 7z or RAR file or they will be removed and not posted.