Skip to content

Commit 1284f73

Browse files
LocNguyenSGU authored and bundolee committed
fix: update CID warning hybrid guidance
1 parent ee88123 commit 1284f73

2 files changed

Lines changed: 5 additions & 3 deletions

File tree

java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/ContentFilterProcessor.java

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -78,7 +78,7 @@ public static List<IObject> getFilteredContents(String inputPdfName, List<IChunk
7878
LOGGER.log(Level.WARNING,
7979
"Page {0}: {1,number,#.#%} of characters are replacement characters (U+FFFD). "
8080
+ "This PDF likely contains CID-keyed fonts without ToUnicode mappings. "
81-
+ "Text extraction may be incomplete. Consider using --hybrid-mode for OCR fallback.",
81+
+ "Text extraction may be incomplete. Consider enabling hybrid OCR fallback with --hybrid docling-fast.",
8282
new Object[]{pageNumber + 1, replacementCharRatio});
8383
}
8484
TextProcessor.replaceUndefinedCharacters(pageContents, config.getReplaceInvalidChars());

java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/processors/CidFontDetectionTest.java

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -120,9 +120,11 @@ public void testCidPdfWarningLogEmitted() throws IOException {
120120
);
121121

122122
boolean hasReplacementWarning = warnings.stream()
123-
.anyMatch(w -> w.contains("replacement characters"));
123+
.anyMatch(w -> w.contains("replacement characters")
124+
&& w.contains("--hybrid docling-fast")
125+
&& !w.contains("--hybrid-mode for OCR fallback"));
124126
Assertions.assertTrue(hasReplacementWarning,
125-
"Expected WARNING log about replacement characters");
127+
"Expected WARNING log about replacement characters with actionable hybrid guidance");
126128
} finally {
127129
logger.removeHandler(handler);
128130
}

0 commit comments

Comments
 (0)