Scrape text not in PDF text line field

lidds · Post by **lidds** » Wed Dec 17, 2025 11:04 pm

I am using the following code to gather text from a PDF.

However, this only works if the region area specified by the user encompasses a PDF text field. Is it possible to amend the code to get the text characters within the region, regardless of the text field area?

Code: Select all

''' <summary>
''' Reads the text lines of one or more pages of a PDF document and returns them as
''' DataTables, optionally keeping only the characters whose rectangles lie entirely
''' inside the supplied area and optionally filtering the resulting lines by a regex.
''' </summary>
''' <param name="areaLeft">Left edge of the selection area (swapped with areaRight if larger).</param>
''' <param name="areaTop">Top edge of the selection area (swapped with areaBottom if smaller).</param>
''' <param name="areaRight">Right edge of the selection area.</param>
''' <param name="areaBottom">Bottom edge of the selection area.</param>
''' <param name="useArea">True to keep only characters inside the area; False to keep every character of every line.</param>
''' <param name="runRegExTest">True to keep only lines whose text matches regExStr.</param>
''' <param name="regExStr">Pattern used when runRegExTest is True (case-insensitive, single-line).</param>
''' <param name="PDFDoc">Document to read; when Nothing the tables are returned empty.</param>
''' <param name="m_pxcInst">Instance used to create the page-text options.</param>
''' <param name="m_auxInst">Instance passed to convertErr when an exception is reported.</param>
''' <param name="pageNumber">Page index to scrape; 0 is treated as "every page".</param>
''' <param name="scrapAllPages">When False the page loop stops after page index 0.</param>
''' <returns>
''' An InfoParams whose success flag is True and whose tables hold the found characters and
''' lines; on any exception success is False and the error fields are filled in instead.
''' </returns>
Public Function GetTextFromSquare(ByVal areaLeft As Double, ByVal areaTop As Double, ByVal areaRight As Double, ByVal areaBottom As Double, ByVal useArea As Boolean, ByVal runRegExTest As Boolean, ByVal regExStr As String, ByVal PDFDoc As IPXV_Document, ByVal m_pxcInst As IPXC_Inst, ByVal m_auxInst As IAUX_Inst, ByVal pageNumber As Integer, ByVal scrapAllPages As Boolean) As InfoParams
    Dim params = New InfoParams
    params.success = True

    ' Normalise the area so that top >= bottom and left <= right
    If areaTop < areaBottom Then
        Dim swapVertical As Double = areaTop
        areaTop = areaBottom
        areaBottom = swapVertical
    End If

    If areaLeft > areaRight Then
        Dim swapHorizontal As Double = areaLeft
        areaLeft = areaRight
        areaRight = swapHorizontal
    End If

    ' Builds character datatable
    ' NOTE(review): the "xmin"/"xmax" columns are filled from the rect's top/bottom and
    ' "ymin"/"ymax" from left/right further down - confirm this axis naming is intended.
    params.myFoundCharsDT = New DataTable
    params.myFoundCharsDT.Columns.Add("index", GetType(Integer))
    params.myFoundCharsDT.Columns.Add("xmin", GetType(Double))
    params.myFoundCharsDT.Columns.Add("xmax", GetType(Double))
    params.myFoundCharsDT.Columns.Add("ymin", GetType(Double))
    params.myFoundCharsDT.Columns.Add("ymax", GetType(Double))
    params.myFoundCharsDT.Columns.Add("char", GetType(String))

    ' Builds line datatable
    params.myFoundLinesDT = New DataTable
    params.myFoundLinesDT.Columns.Add("index", GetType(Integer))
    params.myFoundLinesDT.Columns.Add("midX", GetType(Double))
    params.myFoundLinesDT.Columns.Add("midY", GetType(Double))
    params.myFoundLinesDT.Columns.Add("horizontal", GetType(Boolean))
    params.myFoundLinesDT.Columns.Add("text", GetType(String))
    params.myFoundLinesDT.Columns.Add("boundLeft", GetType(Double))
    params.myFoundLinesDT.Columns.Add("boundRight", GetType(Double))
    params.myFoundLinesDT.Columns.Add("boundTop", GetType(Double))
    params.myFoundLinesDT.Columns.Add("boundBottom", GetType(Double))

    Try
        If PDFDoc Is Nothing Then
            Return params
        End If

        ' Created on first use so an invalid pattern is only reported when a line is actually tested
        Dim lineFilter As Regex = Nothing

        For pageNum As UInteger = 0 To PDFDoc.CoreDoc.Pages.Count - 1
            ' pageNumber = 0 selects every page (the original compared against Nothing, i.e. 0)
            If pageNum = pageNumber OrElse pageNumber = 0 Then
                Dim page As IPXC_Page = PDFDoc.CoreDoc.Pages(pageNum)
                Dim getTextOptions As IPXC_GetPageTextOptions = m_pxcInst.CreateGetPageTextOptions(1)
                Dim pageText As IPXC_PageText = page.GetText(getTextOptions, False)

                If pageText.LinesCount <> 0 Then
                    For lineCount As UInteger = 0 To CUInt(pageText.LinesCount - 1)
                        Dim firstChar As UInteger = pageText.LineInfo(lineCount).nFirstCharIndex
                        Dim charCount As UInteger = pageText.LineInfo(lineCount).nCharsCount
                        Dim endChar As UInteger = firstChar + charCount

                        ' An explicit flag is used instead of "index = Nothing": 0 is a valid
                        ' character index, so it cannot double as the "no match yet" marker.
                        Dim hasMatch As Boolean = False
                        ' Holds exactly the characters that were accepted, so the text stays
                        ' correct even when the accepted characters are not contiguous.
                        Dim lineText As New System.Text.StringBuilder()
                        Dim midX As Double = 0
                        Dim midY As Double = 0
                        Dim charMin As PXC_Rect = Nothing
                        Dim charMax As PXC_Rect = Nothing

                        Dim charIndex As UInteger = firstChar
                        Do While charIndex < endChar
                            Dim vChar As PXC_Rect = pageText.CharRect(charIndex)

                            ' A character is accepted only if its whole rectangle is inside the area
                            Dim insideArea As Boolean = (vChar.top <= areaTop) AndAlso (vChar.bottom >= areaBottom) AndAlso (vChar.left >= areaLeft) AndAlso (vChar.right <= areaRight)

                            If (Not useArea) OrElse insideArea Then
                                If Not hasMatch Then
                                    hasMatch = True
                                    midX = vChar.bottom + ((vChar.top - vChar.bottom) / 2)
                                    charMin = vChar
                                End If

                                charMax = vChar
                                lineText.Append(Convert.ToChar(pageText.Char(charIndex)))

                                ' Per-character rows are only recorded in area mode
                                If useArea Then
                                    params.myFoundCharsDT.Rows.Add(lineCount, vChar.top, vChar.bottom, vChar.left, vChar.right, Convert.ToChar(pageText.Char(charIndex)))
                                End If
                            End If

                            charIndex = charIndex + 1
                        Loop

                        If hasMatch Then
                            ' Collapse runs of spaces and trim the ends
                            Dim pdfWord As String = Regex.Replace(lineText.ToString(), " {2,}", " ")
                            pdfWord = pdfWord.Replace(Chr(34), """").Trim
                            midY = charMin.left + ((charMin.right - charMin.left) / 2)

                            ' NOTE(review): flagged False when first and last accepted characters share
                            ' the same bottom value - confirm this orientation test is the intended way round.
                            Dim isHorizontal As Boolean = (charMin.bottom <> charMax.bottom)

                            Dim keepLine As Boolean = True
                            If runRegExTest Then
                                If lineFilter Is Nothing Then
                                    regExStr = regExStr.Replace(Chr(34), """")
                                    lineFilter = New Regex(regExStr, RegexOptions.IgnoreCase Or RegexOptions.Singleline)
                                End If
                                keepLine = lineFilter.Match(pdfWord).Success
                            End If

                            If keepLine Then
                                params.myFoundLinesDT.Rows.Add(lineCount, midX, midY, isHorizontal, pdfWord, charMin.left, charMax.right, charMin.top, charMax.bottom)
                            End If
                        End If
                    Next

                    ' Rebuild the combined string from every line found so far, in sorted order
                    params.myFoundLinesDT.DefaultView.Sort = "midY DESC, midX DESC"
                    Dim combined As New System.Text.StringBuilder()
                    If useArea Then
                        For Each myRow As DataRowView In params.myFoundLinesDT.DefaultView
                            combined.Append(" ").Append(myRow("text").ToString)
                        Next
                    End If
                    params.returnString = combined.ToString()
                End If

                ' A specific page was requested and has now been processed
                If pageNumber <> 0 Then
                    Exit For
                End If
            End If

            ' Only scrap single page, otherwise the document and rev numbers combine from all pages
            ' NOTE(review): this exits after page index 0 even when a non-zero pageNumber was
            ' requested, so that page is never reached when scrapAllPages is False - confirm.
            If scrapAllPages = False AndAlso pageNum = 0 Then
                Exit For
            End If
        Next

        Return params
    Catch ex As Exception
        params.success = False
        params.errorCode = "37,001"
        params.errorException = ex.Message
        params.errorMessage = "Error getting text from area on document"
        params.errorPDFException = convertErr(ex, m_auxInst).ToString

        Return params
    End Try
End Function

An example is shown below:

image.png

Thanks

Simon

lidds · Post by **lidds** » Tue Dec 23, 2025 5:24 pm

I was just wondering if anyone has an idea on how to do this?

Thanks

Simon

Thu Jan 08, 2026 10:46 pm

Please try to set one parameter in your code:

getTextOptions.TableDetectMode = TDM_ByLine;

It may try to split the text-lines according to the table structures.

HTH.

Scrape text not in PDF text line field

Scrape text not in PDF text line field

Re: Scrape text not in PDF text line field

Re: Scrape text not in PDF text line field