However, this only works if the region area specified by the user encompasses a PDF text field. Is it possible to amend the code to get the text characters within the region, regardless of the text field area?
Code: Select all
''' <summary>
''' Walks the text lines of the document's pages and collects, per line, the run of
''' characters that lies inside the given rectangle (or every character of the line
''' when <paramref name="useArea"/> is False).
''' </summary>
''' <param name="areaLeft">Left edge of the rectangle; swapped with areaRight if larger.</param>
''' <param name="areaTop">Top edge of the rectangle; swapped with areaBottom if smaller.</param>
''' <param name="areaRight">Right edge of the rectangle.</param>
''' <param name="areaBottom">Bottom edge of the rectangle.</param>
''' <param name="useArea">True to keep only characters whose rectangle lies fully inside the area.</param>
''' <param name="runRegExTest">True to keep only lines whose text matches <paramref name="regExStr"/>.</param>
''' <param name="regExStr">Pattern applied (case-insensitive, single-line) when runRegExTest is True.</param>
''' <param name="PDFDoc">Document to read; when Nothing no page is processed.</param>
''' <param name="m_pxcInst">Instance used to create the page-text options.</param>
''' <param name="m_auxInst">Instance passed to convertErr when an exception is reported.</param>
''' <param name="pageNumber">Zero-based page index to read; 0 also matches every page (see note in the body).</param>
''' <param name="scrapAllPages">When False the page loop stops after page index 0.</param>
''' <returns>
''' An InfoParams whose myFoundLinesDT holds one row per matching line, whose myFoundCharsDT
''' holds one row per matching character (only when useArea is True) and whose returnString is
''' the space-joined line texts (only when useArea is True). On an exception, success is False
''' and the error fields are filled in.
''' </returns>
Public Function GetTextFromSquare(ByVal areaLeft As Double, ByVal areaTop As Double, ByVal areaRight As Double, ByVal areaBottom As Double, ByVal useArea As Boolean, ByVal runRegExTest As Boolean, ByVal regExStr As String, ByVal PDFDoc As IPXV_Document, ByVal m_pxcInst As IPXC_Inst, ByVal m_auxInst As IAUX_Inst, ByVal pageNumber As Integer, ByVal scrapAllPages As Boolean) As InfoParams
    Dim params = New InfoParams
    params.success = True

    ' Normalise the rectangle so that areaTop >= areaBottom and areaLeft <= areaRight.
    If areaTop < areaBottom Then
        Dim swapValue As Double = areaTop
        areaTop = areaBottom
        areaBottom = swapValue
    End If
    If areaLeft > areaRight Then
        Dim swapValue As Double = areaLeft
        areaLeft = areaRight
        areaRight = swapValue
    End If

    ' Builds character datatable.
    ' Note: the "xmin"/"xmax" columns receive the character's top/bottom and the
    ' "ymin"/"ymax" columns receive its left/right (see the Rows.Add call below).
    params.myFoundCharsDT = New DataTable
    params.myFoundCharsDT.Columns.Add("index", GetType(Integer))
    params.myFoundCharsDT.Columns.Add("xmin", GetType(Double))
    params.myFoundCharsDT.Columns.Add("xmax", GetType(Double))
    params.myFoundCharsDT.Columns.Add("ymin", GetType(Double))
    params.myFoundCharsDT.Columns.Add("ymax", GetType(Double))
    params.myFoundCharsDT.Columns.Add("char", GetType(String))

    ' Builds line datatable
    params.myFoundLinesDT = New DataTable
    params.myFoundLinesDT.Columns.Add("index", GetType(Integer))
    params.myFoundLinesDT.Columns.Add("midX", GetType(Double))
    params.myFoundLinesDT.Columns.Add("midY", GetType(Double))
    params.myFoundLinesDT.Columns.Add("horizontal", GetType(Boolean))
    params.myFoundLinesDT.Columns.Add("text", GetType(String))
    params.myFoundLinesDT.Columns.Add("boundLeft", GetType(Double))
    params.myFoundLinesDT.Columns.Add("boundRight", GetType(Double))
    params.myFoundLinesDT.Columns.Add("boundTop", GetType(Double))
    params.myFoundLinesDT.Columns.Add("boundBottom", GetType(Double))

    Try
        params.myFoundCharsDT.Rows.Clear()
        params.myFoundLinesDT.Rows.Clear()

        Dim pdfWord As String = Nothing
        Dim isHorizontal As Boolean = False
        ' Created on first use so an invalid or missing pattern only raises an
        ' error when there is actually a line to test.
        Dim lineFilter As Regex = Nothing

        If PDFDoc IsNot Nothing Then
            For pageNum As UInteger = 0 To PDFDoc.CoreDoc.Pages.Count - 1
                ' pageNumber = 0 matches every page, so page index 0 cannot be told
                ' apart from "no page requested".
                If pageNum = pageNumber Or pageNumber = 0 Then
                    Dim page As IPXC_Page = PDFDoc.CoreDoc.Pages(pageNum)
                    Dim getTextOptions As IPXC_GetPageTextOptions = m_pxcInst.CreateGetPageTextOptions(1)
                    Dim pageText As IPXC_PageText = page.GetText(getTextOptions, False)
                    Dim pageMatrix As PXC_Matrix = page.Matrix

                    If pageText.LinesCount <> 0 Then
                        For lineCount As UInteger = 0 To CUInt(pageText.LinesCount - 1)
                            Dim lineFirstChar As UInteger = pageText.LineInfo(lineCount).nFirstCharIndex
                            Dim lineCharCount As UInteger = pageText.LineInfo(lineCount).nCharsCount
                            Dim charLoop As UInteger = lineFirstChar
                            Dim iChars As Integer = lineFirstChar + lineCharCount

                            ' An explicit flag marks "a match has started". Comparing the
                            ' Integer index against Nothing is a comparison against 0 and
                            ' mis-handles a match that begins at character index 0.
                            Dim hasMatch As Boolean = False
                            Dim matchFirstChar As Integer = 0
                            Dim matchCount As Integer = 0
                            Dim midX As Double = 0
                            Dim midY As Double = 0
                            Dim charMin As PXC_Rect = Nothing
                            Dim charMax As PXC_Rect = Nothing

                            Do While (charLoop < iChars)
                                Dim vChar As PXC_Rect = pageText.CharRect(charLoop)

                                ' Without an area every character counts; with one, the
                                ' character rectangle must lie completely inside it.
                                Dim inRegion As Boolean = (Not useArea) OrElse ((vChar.top <= areaTop) AndAlso (vChar.bottom >= areaBottom) AndAlso (vChar.left >= areaLeft) AndAlso (vChar.right <= areaRight))

                                If inRegion Then
                                    If Not hasMatch Then
                                        hasMatch = True
                                        matchFirstChar = charLoop
                                        ' midX is the midpoint between bottom and top of the first matching character.
                                        midX = vChar.bottom + ((vChar.top - vChar.bottom) / 2)
                                        charMin = vChar
                                    End If
                                    matchCount = matchCount + 1
                                    charMax = vChar
                                    If useArea Then
                                        params.myFoundCharsDT.Rows.Add(lineCount, vChar.top, vChar.bottom, vChar.left, vChar.right, Convert.ToChar(pageText.Char(charLoop)))
                                    End If
                                End If

                                charLoop = (charLoop + 1)
                            Loop

                            If hasMatch Then
                                ' Assumes the matching characters of a line are contiguous:
                                ' the text is read as matchCount characters from the first match.
                                pdfWord = Regex.Replace(pageText.GetChars(matchFirstChar, matchCount), " {2,}", " ")
                                ' NOTE(review): this replaces a double quote with a double quote;
                                ' presumably a typographic-quote replacement was intended - confirm.
                                pdfWord = pdfWord.Replace(Chr(34), """").Trim
                                ' midY is the midpoint between left and right of the first matching character.
                                midY = charMin.left + ((charMin.right - charMin.left) / 2)
                                If charMin.bottom = charMax.bottom Then
                                    isHorizontal = False
                                Else
                                    isHorizontal = True
                                End If

                                Dim keepLine As Boolean = True
                                If runRegExTest = True Then
                                    If lineFilter Is Nothing Then
                                        regExStr = regExStr.Replace(Chr(34), """")
                                        lineFilter = New Regex(regExStr, RegexOptions.IgnoreCase Or RegexOptions.Singleline)
                                    End If
                                    keepLine = lineFilter.Match(pdfWord).Success
                                End If

                                If keepLine Then
                                    params.myFoundLinesDT.Rows.Add(lineCount, midX, midY, isHorizontal, pdfWord, charMin.left, charMax.right, charMin.top, charMax.bottom)
                                End If
                            End If
                        Next

                        ' Rebuild the combined string from every line found so far,
                        ' ordered by midY then midX, both descending.
                        params.myFoundLinesDT.DefaultView.Sort = "midY DESC, midX DESC"
                        params.returnString = ""
                        For Each myRow As DataRowView In params.myFoundLinesDT.DefaultView
                            If useArea = True Then
                                params.returnString = params.returnString & " " & myRow("text").ToString
                            End If
                        Next
                    End If

                    ' A specific (non-zero) page was requested and has now been handled.
                    If pageNumber <> 0 Then
                        Exit For
                    End If
                End If

                ' Only scrap single page, otherwise the document and rev numbers combine from all pages
                ' NOTE(review): with scrapAllPages = False this exits after page index 0 even
                ' when pageNumber names a later page, so that page is never read - confirm intent.
                If scrapAllPages = False And pageNum = 0 Then
                    Exit For
                End If
            Next
        End If

        Return params
    Catch ex As Exception
        params.success = False
        params.errorCode = "37,001"
        params.errorException = ex.Message
        params.errorMessage = "Error getting text from area on document"
        params.errorPDFException = convertErr(ex, m_auxInst).ToString
        Return params
    End Try
End Function
An example is shown below:
Thanks
Simon