
PXCp_ET_GetElement shown incorrect in simple chinese
Moderators: PDF-XChange Support, Daniel - PDF-XChange, Chris - PDF-XChange, Sean - PDF-XChange, Vasyl - PDF-XChange, Stefan - PDF-XChange
-
- User
- Posts: 12
- Joined: Mon Apr 27, 2009 3:50 am
PXCp_ET_GetElement shown incorrect in simple chinese
I extract all the text into a text file in Simplified Chinese, but I can't get the correct result.

You do not have the required permissions to view the files attached to this post.
-
- User
- Posts: 12
- Joined: Mon Apr 27, 2009 3:50 am
Re: PXCp_ET_GetElement shown incorrect in simple chinese
the source file
// Extracts all text from the open source document (hDocument) and writes it
// into a newly created PDF document, keeping fonts, text state and positions.
//
// Steps:
//   1. prepare text extraction and query the number of source fonts,
//   2. create the destination document and register one font per source font
//      (name + optional style; 'Arial' when the source font has no name),
//   3. for every page: copy media box, crop box and rotation, analyse the page
//      content and re-emit each text element character by character,
//   4. write the destination document, then release everything.
//
// Sender is not used. Failures are reported through ErrorCheck/Log; hr carries
// the most recent library result through the whole procedure.
procedure TFormMainMenu.TextExtractFormattedExecute(Sender: TObject);
var
  hr: HRESULT;
  fontCount, i, bufLen, StyleLen, tempBufLen, PageCnt: DWORD;
  TextElCount, CurCount, CurPage, t, j: DWORD;
  fontIDs: array of DWORD;
  hDstDoc: hPDF;
  fontName, sFileName: WideString;
  bNoFontNameSet: bool;
  rcMediaBox, rcCropBox: PXC_RectF;
  nAngle: LONG;
  hDstPage: PXCPage;
  pto: PXC_TextOptions;
  TextElement: PXP_TextElement;
  ptTextOrg: PXC_PointF;
  buf: array[0..1] of WChar;
const
  DefaultFontName: WideString = 'Arial';
  // Length of DefaultFontName in characters, including the null terminator.
  DefaultFontNameLen = 6;
begin
  LogNL(#10'Extract all text from PDF document and save it into the other document keeping formating:');
  Log('Preparsing document: ');
  hr := PXCp_ET_Prepare(hDocument);
  if (not ErrorCheck(hr)) then
  begin
    LogNL;
    Exit;
  end;
  LogNL;
  fontCount := 0;
  hDstDoc := 0;
  try
    // 1. Get all fonts from the doc
    hr := PXCp_ET_GetFontCount(hDocument, @fontCount);
    if (not IS_DS_FAILED(hr)) then
    begin
      // 2. Create new doc
      hr := PXC_NewDocument(@hDstDoc, nil, nil);
      if (not IS_DS_FAILED(hr)) then
      begin
        SetLength(fontIDs, fontCount);
        // fontCount is unsigned: without this guard "fontCount - 1" wraps
        // around when the document contains no fonts.
        if (fontCount > 0) then
          for i := 0 to fontCount - 1 do
          begin
            bufLen := 0;
            // get font name length
            // if it is equal to 1 then there is no font name
            // "1" is null-terminator in this case
            if (not IS_DS_FAILED(hr)) then
              hr := PXCp_ET_GetFontName(hDocument, i, nil, @bufLen);
            if (not IS_DS_FAILED(hr)) then
            begin
              // Check if the font has any name set
              bNoFontNameSet := bufLen <= 1;
              // if there is no font name, default 'Arial' will be used
              if (bNoFontNameSet) then
                bufLen := DefaultFontNameLen;
              // Get the length of font style
              StyleLen := 0;
              hr := PXCp_ET_GetFontStyle(hDocument, i, nil, @StyleLen);
              // Check if there is font style set
              if (StyleLen <= 1) then
                StyleLen := 0;
              if ((not IS_DS_FAILED(hr)) and (StyleLen <> 0)) then
                // if there is font style set - then adjust the buffer length
                bufLen := bufLen + StyleLen;
              // Make sure the shared buffer can hold name + style
              if (bufLen > Length(fontName)) then
                SetLength(fontName, bufLen);
              if (bNoFontNameSet) then
                // Copy the default name (with its terminator) INTO the buffer.
                // Assigning the constant would shrink fontName to 5 characters
                // and the style written below would overrun it.
                Move(PWChar(DefaultFontName)^, PWChar(fontName)^,
                  DefaultFontNameLen * SizeOf(WideChar))
              else
              begin
                // else acquire font name from the library
                tempBufLen := bufLen;
                hr := PXCp_ET_GetFontName(hDocument, i, PWChar(fontName), @tempBufLen);
              end;
              if ((not IS_DS_FAILED(hr)) and (StyleLen <> 0)) then
                // if there is font style set - acquire it; it is written over
                // the name's null terminator so both form one string
                hr := PXCp_ET_GetFontStyle(hDocument, i,
                  PWChar(fontName) + (bufLen - StyleLen) - 1, @StyleLen);
              // add the font into the destination document; the resulting ID
              // must land in slot i, because fontIDs[FontIndex] is what is
              // later passed to PXC_SetCurrentFont
              if (not IS_DS_FAILED(hr)) then
                hr := PXC_AddFontW(hDstDoc, FW_NORMAL, false, PWChar(fontName), @fontIDs[i]);
            end;
          end;
      end;
      PageCnt := 0;
      if (not IS_DS_FAILED(hr)) then
        hr := PXCp_GetPagesCount(hDocument, @PageCnt);
      if ((not IS_DS_FAILED(hr)) and (PageCnt <> 0)) then
      begin
        // 3. for each page
        Log('Processing pages: ');
        for CurPage := 0 to PageCnt - 1 do
        begin
          // create new page in the new document, sized like the source media box
          if (not IS_DS_FAILED(hr)) then
            hr := PXCp_PageGetBox(hDocument, CurPage, PB_MediaBox, @rcMediaBox);
          hDstPage := 0;
          if (not IS_DS_FAILED(hr)) then
            hr := PXC_AddPage(hDstDoc, rcMediaBox.right - rcMediaBox.left,
              rcMediaBox.top - rcMediaBox.bottom, @hDstPage);
          if (not IS_DS_FAILED(hr)) then
          begin
            // copy crop box and rotation to the new page
            hr := PXCp_PageGetBox(hDocument, CurPage, PB_CropBox, @rcCropBox);
            if (not IS_DS_FAILED(hr)) then
              hr := PXC_SetPageBox(hDstPage, PB_CropBox, @rcCropBox);
            hr := PXCp_PageGetRotate(hDocument, CurPage, @nAngle);
            if ((not IS_DS_FAILED(hr)) and (nAngle <> 0)) then
              hr := PXC_SetPageRotation(hDstPage, nAngle);
            // position text by its baseline on the destination page
            pto.cbSize := SizeOf(PXC_TextOptions);
            PXC_GetTextOptions(hDstPage, @pto);
            pto.nTextPosition := TextPosition_Baseline;
            PXC_SetTextOptions(hDstPage, @pto);
            // for each element
            hr := PXCp_ET_AnalyzePageContent(hDocument, CurPage);
            TextElCount := 0;
            if (not IS_DS_FAILED(hr)) then
              hr := PXCp_ET_GetElementCount(hDocument, @TextElCount);
            if ((not IS_DS_FAILED(hr)) and (TextElCount <> 0)) then
            begin
              TextElement.cbSize := SizeOf(PXP_TextElement);
              CurCount := 0;
              ptTextOrg.x := 0;
              ptTextOrg.y := 1;
              buf[0] := #0;
              buf[1] := #0;
              for t := 0 to TextElCount - 1 do
              begin
                // first call with an empty mask only retrieves Count
                TextElement.Count := 0;
                TextElement.mask := 0;
                hr := PXCp_ET_GetElement(hDocument, t, @TextElement, 0);
                if ((not IS_DS_FAILED(hr)) and (TextElement.Count > 0)) then
                begin
                  TextElement.mask := PTEM_Text + PTEM_Offsets + PTEM_Matrix +
                    PTEM_FontInfo + PTEM_TextParams;
                  // grow the character/offset buffers only when this element
                  // is larger than anything seen so far on this page
                  if (CurCount < TextElement.Count) then
                  begin
                    TextElement.Characters := nil;
                    TextElement.Offsets := nil;
                    SetLength(TextElement.Characters, TextElement.Count);
                    SetLength(TextElement.Offsets, TextElement.Count);
                    CurCount := TextElement.Count;
                  end;
                  // second call fills the buffers and the text parameters
                  hr := PXCp_ET_GetElement(hDocument, t, @TextElement, GTEF_IgnorePageRotation);
                  if (not IS_DS_FAILED(hr)) then
                  begin
                    // Now add this text element into new PDF document
                    hr := PXC_TCS_Transform(hDstPage, @TextElement.Matrix);
                    if (fontCount > TextElement.FontIndex) then
                    begin
                      hr := PXC_SetCurrentFont(hDstPage, fontIDs[TextElement.FontIndex], TextElement.FontSize);
                      hr := PXC_SetFillColor(hDstPage, TextElement.FillColor);
                      hr := PXC_SetStrokeColor(hDstPage, TextElement.StrokeColor);
                      hr := PXC_SetTextRMode(hDstPage, TextElement.RenderingMode, nil);
                      hr := PXC_SetTextScaling(hDstPage, TextElement.Th, nil);
                      hr := PXC_SetTextLeading(hDstPage, TextElement.Leading, nil);
                      hr := PXC_SetCharSpacing(hDstPage, TextElement.CharSpace, nil);
                      hr := PXC_SetWordSpacing(hDstPage, TextElement.WordSpace, nil);
                      // Count is unsigned: guard so "Count - 2" cannot wrap
                      // around for a single-entry element.
                      // NOTE(review): the bound Count - 2 leaves out the last
                      // entry, as the original did - confirm this is intended.
                      if (TextElement.Count >= 2) then
                        for j := 0 to TextElement.Count - 2 do
                        begin
                          ptTextOrg.x := TextElement.Offsets[j];
                          buf[0] := TextElement.Characters[j];
                          hr := PXC_TextOutW(hDstPage, @ptTextOrg, buf, 1);
                        end;
                    end;
                  end;
                end;
              end;
            end;
            // drop the per-page element buffers
            TextElement.Characters := nil;
            TextElement.Offsets := nil;
          end;
        end;
        if (ErrorCheck(hr)) then
        begin
          LogNL;
          Log('Writing:');
          sFileName := WideString(FileOpen.Dialog.Files[0]) + '~';
          hr := PXC_WriteDocumentExW(hDstDoc, PWChar(sFileName), $FFFFFFFF, WEF_ShowSaveDialog or WEF_RunApp, nil);
          ErrorCheck(hr);
        end;
        LogNL;
      end;
    end;
  finally
    // clear up - runs even if an exception is raised above
    if (hDstDoc <> 0) then
    begin
      PXC_ReleaseDocument(hDstDoc);
      hDstDoc := 0;
    end;
    fontIDs := nil;
    PXCp_ET_Finish(hDocument);
  end;
end;
// Extracts all text from the open source document (hDocument) and writes it
// into a newly created PDF document, keeping fonts, text state and positions.
//
// Steps:
//   1. prepare text extraction and query the number of source fonts,
//   2. create the destination document and register one font per source font
//      (name + optional style; 'Arial' when the source font has no name),
//   3. for every page: copy media box, crop box and rotation, analyse the page
//      content and re-emit each text element character by character,
//   4. write the destination document, then release everything.
//
// Sender is not used. Failures are reported through ErrorCheck/Log; hr carries
// the most recent library result through the whole procedure.
procedure TFormMainMenu.TextExtractFormattedExecute(Sender: TObject);
var
  hr: HRESULT;
  fontCount, i, bufLen, StyleLen, tempBufLen, PageCnt: DWORD;
  TextElCount, CurCount, CurPage, t, j: DWORD;
  fontIDs: array of DWORD;
  hDstDoc: hPDF;
  fontName, sFileName: WideString;
  bNoFontNameSet: bool;
  rcMediaBox, rcCropBox: PXC_RectF;
  nAngle: LONG;
  hDstPage: PXCPage;
  pto: PXC_TextOptions;
  TextElement: PXP_TextElement;
  ptTextOrg: PXC_PointF;
  buf: array[0..1] of WChar;
const
  DefaultFontName: WideString = 'Arial';
  // Length of DefaultFontName in characters, including the null terminator.
  DefaultFontNameLen = 6;
begin
  LogNL(#10'Extract all text from PDF document and save it into the other document keeping formating:');
  Log('Preparsing document: ');
  hr := PXCp_ET_Prepare(hDocument);
  if (not ErrorCheck(hr)) then
  begin
    LogNL;
    Exit;
  end;
  LogNL;
  fontCount := 0;
  hDstDoc := 0;
  try
    // 1. Get all fonts from the doc
    hr := PXCp_ET_GetFontCount(hDocument, @fontCount);
    if (not IS_DS_FAILED(hr)) then
    begin
      // 2. Create new doc
      hr := PXC_NewDocument(@hDstDoc, nil, nil);
      if (not IS_DS_FAILED(hr)) then
      begin
        SetLength(fontIDs, fontCount);
        // fontCount is unsigned: without this guard "fontCount - 1" wraps
        // around when the document contains no fonts.
        if (fontCount > 0) then
          for i := 0 to fontCount - 1 do
          begin
            bufLen := 0;
            // get font name length
            // if it is equal to 1 then there is no font name
            // "1" is null-terminator in this case
            if (not IS_DS_FAILED(hr)) then
              hr := PXCp_ET_GetFontName(hDocument, i, nil, @bufLen);
            if (not IS_DS_FAILED(hr)) then
            begin
              // Check if the font has any name set
              bNoFontNameSet := bufLen <= 1;
              // if there is no font name, default 'Arial' will be used
              if (bNoFontNameSet) then
                bufLen := DefaultFontNameLen;
              // Get the length of font style
              StyleLen := 0;
              hr := PXCp_ET_GetFontStyle(hDocument, i, nil, @StyleLen);
              // Check if there is font style set
              if (StyleLen <= 1) then
                StyleLen := 0;
              if ((not IS_DS_FAILED(hr)) and (StyleLen <> 0)) then
                // if there is font style set - then adjust the buffer length
                bufLen := bufLen + StyleLen;
              // Make sure the shared buffer can hold name + style
              if (bufLen > Length(fontName)) then
                SetLength(fontName, bufLen);
              if (bNoFontNameSet) then
                // Copy the default name (with its terminator) INTO the buffer.
                // Assigning the constant would shrink fontName to 5 characters
                // and the style written below would overrun it.
                Move(PWChar(DefaultFontName)^, PWChar(fontName)^,
                  DefaultFontNameLen * SizeOf(WideChar))
              else
              begin
                // else acquire font name from the library
                tempBufLen := bufLen;
                hr := PXCp_ET_GetFontName(hDocument, i, PWChar(fontName), @tempBufLen);
              end;
              if ((not IS_DS_FAILED(hr)) and (StyleLen <> 0)) then
                // if there is font style set - acquire it; it is written over
                // the name's null terminator so both form one string
                hr := PXCp_ET_GetFontStyle(hDocument, i,
                  PWChar(fontName) + (bufLen - StyleLen) - 1, @StyleLen);
              // add the font into the destination document; the resulting ID
              // must land in slot i, because fontIDs[FontIndex] is what is
              // later passed to PXC_SetCurrentFont
              if (not IS_DS_FAILED(hr)) then
                hr := PXC_AddFontW(hDstDoc, FW_NORMAL, false, PWChar(fontName), @fontIDs[i]);
            end;
          end;
      end;
      PageCnt := 0;
      if (not IS_DS_FAILED(hr)) then
        hr := PXCp_GetPagesCount(hDocument, @PageCnt);
      if ((not IS_DS_FAILED(hr)) and (PageCnt <> 0)) then
      begin
        // 3. for each page
        Log('Processing pages: ');
        for CurPage := 0 to PageCnt - 1 do
        begin
          // create new page in the new document, sized like the source media box
          if (not IS_DS_FAILED(hr)) then
            hr := PXCp_PageGetBox(hDocument, CurPage, PB_MediaBox, @rcMediaBox);
          hDstPage := 0;
          if (not IS_DS_FAILED(hr)) then
            hr := PXC_AddPage(hDstDoc, rcMediaBox.right - rcMediaBox.left,
              rcMediaBox.top - rcMediaBox.bottom, @hDstPage);
          if (not IS_DS_FAILED(hr)) then
          begin
            // copy crop box and rotation to the new page
            hr := PXCp_PageGetBox(hDocument, CurPage, PB_CropBox, @rcCropBox);
            if (not IS_DS_FAILED(hr)) then
              hr := PXC_SetPageBox(hDstPage, PB_CropBox, @rcCropBox);
            hr := PXCp_PageGetRotate(hDocument, CurPage, @nAngle);
            if ((not IS_DS_FAILED(hr)) and (nAngle <> 0)) then
              hr := PXC_SetPageRotation(hDstPage, nAngle);
            // position text by its baseline on the destination page
            pto.cbSize := SizeOf(PXC_TextOptions);
            PXC_GetTextOptions(hDstPage, @pto);
            pto.nTextPosition := TextPosition_Baseline;
            PXC_SetTextOptions(hDstPage, @pto);
            // for each element
            hr := PXCp_ET_AnalyzePageContent(hDocument, CurPage);
            TextElCount := 0;
            if (not IS_DS_FAILED(hr)) then
              hr := PXCp_ET_GetElementCount(hDocument, @TextElCount);
            if ((not IS_DS_FAILED(hr)) and (TextElCount <> 0)) then
            begin
              TextElement.cbSize := SizeOf(PXP_TextElement);
              CurCount := 0;
              ptTextOrg.x := 0;
              ptTextOrg.y := 1;
              buf[0] := #0;
              buf[1] := #0;
              for t := 0 to TextElCount - 1 do
              begin
                // first call with an empty mask only retrieves Count
                TextElement.Count := 0;
                TextElement.mask := 0;
                hr := PXCp_ET_GetElement(hDocument, t, @TextElement, 0);
                if ((not IS_DS_FAILED(hr)) and (TextElement.Count > 0)) then
                begin
                  TextElement.mask := PTEM_Text + PTEM_Offsets + PTEM_Matrix +
                    PTEM_FontInfo + PTEM_TextParams;
                  // grow the character/offset buffers only when this element
                  // is larger than anything seen so far on this page
                  if (CurCount < TextElement.Count) then
                  begin
                    TextElement.Characters := nil;
                    TextElement.Offsets := nil;
                    SetLength(TextElement.Characters, TextElement.Count);
                    SetLength(TextElement.Offsets, TextElement.Count);
                    CurCount := TextElement.Count;
                  end;
                  // second call fills the buffers and the text parameters
                  hr := PXCp_ET_GetElement(hDocument, t, @TextElement, GTEF_IgnorePageRotation);
                  if (not IS_DS_FAILED(hr)) then
                  begin
                    // Now add this text element into new PDF document
                    hr := PXC_TCS_Transform(hDstPage, @TextElement.Matrix);
                    if (fontCount > TextElement.FontIndex) then
                    begin
                      hr := PXC_SetCurrentFont(hDstPage, fontIDs[TextElement.FontIndex], TextElement.FontSize);
                      hr := PXC_SetFillColor(hDstPage, TextElement.FillColor);
                      hr := PXC_SetStrokeColor(hDstPage, TextElement.StrokeColor);
                      hr := PXC_SetTextRMode(hDstPage, TextElement.RenderingMode, nil);
                      hr := PXC_SetTextScaling(hDstPage, TextElement.Th, nil);
                      hr := PXC_SetTextLeading(hDstPage, TextElement.Leading, nil);
                      hr := PXC_SetCharSpacing(hDstPage, TextElement.CharSpace, nil);
                      hr := PXC_SetWordSpacing(hDstPage, TextElement.WordSpace, nil);
                      // Count is unsigned: guard so "Count - 2" cannot wrap
                      // around for a single-entry element.
                      // NOTE(review): the bound Count - 2 leaves out the last
                      // entry, as the original did - confirm this is intended.
                      if (TextElement.Count >= 2) then
                        for j := 0 to TextElement.Count - 2 do
                        begin
                          ptTextOrg.x := TextElement.Offsets[j];
                          buf[0] := TextElement.Characters[j];
                          hr := PXC_TextOutW(hDstPage, @ptTextOrg, buf, 1);
                        end;
                    end;
                  end;
                end;
              end;
            end;
            // drop the per-page element buffers
            TextElement.Characters := nil;
            TextElement.Offsets := nil;
          end;
        end;
        if (ErrorCheck(hr)) then
        begin
          LogNL;
          Log('Writing:');
          sFileName := WideString(FileOpen.Dialog.Files[0]) + '~';
          hr := PXC_WriteDocumentExW(hDstDoc, PWChar(sFileName), $FFFFFFFF, WEF_ShowSaveDialog or WEF_RunApp, nil);
          ErrorCheck(hr);
        end;
        LogNL;
      end;
    end;
  finally
    // clear up - runs even if an exception is raised above
    if (hDstDoc <> 0) then
    begin
      PXC_ReleaseDocument(hDstDoc);
      hDstDoc := 0;
    end;
    fontIDs := nil;
    PXCp_ET_Finish(hDocument);
  end;
end;
You do not have the required permissions to view the files attached to this post.
-
- Site Admin
- Posts: 5225
- Joined: Tue Jun 29, 2004 10:34 am
Re: PXCp_ET_GetElement shown incorrect in simple chinese
Please also zip and attach the PDF file concerned.
thanks
thanks
If posting files to this forum - you must archive the files to a ZIP, RAR or 7z file or they will not be uploaded - thank you.
Best regards
Tracker Support
http://www.tracker-software.com
Best regards
Tracker Support
http://www.tracker-software.com