Details
-
Improvement
-
Status: Closed
-
Major
-
Resolution: Won't Fix
-
1.8.9
-
None
-
None
Description
20 0 obj << /Type /Font /BaseFont /AdobeSongStd-Light,Bold-UniGB-UTF16-H /Subtype /Type0 /Encoding /UniGB-UTF16-H /DescendantFonts [42 0 R] >> endobj
If a Type0 font is defined as above, org.apache.pdfbox.pdmodel.font.PDType0Font cannot produce CJK strings from it.
PDType0Font processes only embedded CMaps and ignores the predefined CJK CMaps.
As a result, Chinese, Japanese, and Korean text that uses such a font cannot be extracted.
I modified the PDType0Font source as shown below, and with this change the text is extracted correctly.
PDType0Font.java
@Override public String encode(byte[] c, int offset, int length) throws IOException { String retval = null; if (hasToUnicode()) { retval = super.encode(c, offset, length); } if (retval == null) { int result = cmap.lookupCID(c, offset, length); if (result != -1) { retval = descendantFont.cmapEncoding(result, 2, true, null); } else { // Predefined CJK CMap // // PDF Source: // 20 0 obj // << // /Type /Font // /BaseFont /AdobeSongStd-Light,Bold-UniGB-UTF16-H // /Subtype /Type0 // /Encoding /UniGB-UTF16-H // /DescendantFonts [42 0 R] // >> // endobj // COSBase encoding = getEncoding(); if (length == 2 && encoding instanceof COSName) { String encname = ((COSName)encoding).getName(); String charset = charsetOfPredefinedCJKCMap(encname); if (charset!=null) { retval = new String(c, offset, length, charset); } } } } return retval; } /** * Predefined CJK CMap name to Java charset name * * @author Raymond Wu <raymondwu@softnext.com.tw> * @param encname Predefined CJK CMap name * @return Java charset name */ public String charsetOfPredefinedCJKCMap(String encname) { // PDF 32000-1:2008 Page 274 // Table 118 – Predefined CJK CMap names // // @See http://collinssoftware.com/computer/robots/utility/html/documentation/FontEncoding.htm // @See https://docs.oracle.com/javase/8/docs/technotes/guides/intl/encoding.doc.html // Unicode if (encname.contains("UTF16")) return "UTF-16BE"; if (encname.contains("UCS2")) return "UTF-16BE"; // Chinese (Traditional) // @See https://zh.wikipedia.org/wiki/巴別塔 if (encname.startsWith("B5pc-")) return "BIG5"; if (encname.startsWith("HKscs-")) return "MS950_HKSCS"; if (encname.startsWith("ETen-")) return "MS950"; if (encname.startsWith("ETenms-")) return "MS950"; if (encname.startsWith("CNS-")) return "EUC-TW"; // Chinese (Simplified) if (encname.startsWith("GB-")) return "MS936"; if (encname.startsWith("GBpc-")) return "GB2312"; if (encname.startsWith("GBK-")) return "MS936"; if (encname.startsWith("GBKp-")) return "MS936"; if 
(encname.startsWith("GBK2K-")) return "GB18030"; // Japanese if (encname.startsWith("83pv-")) return "JISAutoDetect"; // JIS X 0208 + KanjiTalk6 (漢字6) if (encname.startsWith("90ms-")) return "JISAutoDetect"; // MS932 if (encname.startsWith("90msp-")) return "JISAutoDetect"; // MS932 if (encname.startsWith("90pv-")) return "JISAutoDetect"; // JIS X 0208 + KanjiTalk7 (漢字7) if (encname.startsWith("Add-")) return "JISAutoDetect"; // JIS X 0208 + Fujitsu FMR if (encname.startsWith("EUC-")) return "JISAutoDetect"; // JIS X 0208 if (encname.startsWith("Ext-")) return "JISAutoDetect"; // JIS C 6226 + NEC if (encname.equals("H")) return "JISAutoDetect"; // ISO-2022-JP if (encname.equals("V")) return "JISAutoDetect"; // ISO-2022-JP // Korean if (encname.startsWith("KSC-")) return "EUC_KR"; if (encname.startsWith("KSCms-")) return "MS949"; if (encname.startsWith("KSCpc-")) return "EUC_KR"; return null; }