HarfBuzz 렌더러를 사용하는 동안 Devanagari UTF-8 텍스트 추출 코드에서 lua 오류가 발생합니다.

Question 1

노드 모드에서는 모양이 지정된 출력을 얻고 모양이 지정된 글리프를 입력 텍스트에 다시 고유하게 매핑할 수 없기 때문에 일반적으로 전체 텍스트를 복원하는 것이 불가능합니다. tounicode 값을 사용해서만 대략적으로 계산할 수 있습니다. 이는 실제 PDF 파일 ToUnicode CMap 항목에 매핑되므로 제한된 문자 모양 대 유니코드 매핑 모델을 따릅니다. 모든 문자 모양은 고정된 유니코드 코드 포인트 시퀀스와 동일합니다. 이러한 매핑은 렌더링 순서대로 연결됩니다. 보시다시피 이 모델은 Devanagari 문자 모양을 입력 텍스트에 매핑하는 데 충분하지 않습니다.

이 문제를 피하기 위해 harf 모드를 대신 사용할 수 있습니다. harf 모드는 모양이 지정된 문자 모양 목록을 제공할 뿐만 아니라, ToUnicode로는 올바르게 모델링할 수 없는 시퀀스에 대해 ToUnicode 매핑을 재정의하는 PDF 표시 콘텐츠(marked content) ActualText 항목을 추가로 생성하기 때문에 이 제한된 모델의 영향을 받지 않습니다. 이 매핑에 필요한 데이터는 glyph_info 속성을 사용하여 Lua 코드에서 쿼리할 수 있습니다. (이것은 문서화되지 않은 구현 세부 사항이며 향후 변경될 수 있습니다.)

텍스트에서 가능한 한 많은 것을 추출하려면 Lua 코드에서 이 속성 기반 접근 방식과 ToUnicode 기반 접근 방식을 결합할 수 있습니다.

다음 내용으로 extracttext.lua 파일을 만듭니다.

local type = type
local char = utf8.char
local unpack = table.unpack
local getproperty = node.getproperty
local getfont = font.getfont
local is_glyph = node.is_glyph

-- tounicode values are UTF-16 code units written in hex, so surrogate pairs
-- have to be recombined into one code point before encoding as UTF-8.
local utf16hex_to_utf8
do -- Untested, but should more or less work
  local R, S, C, Cs = lpeg.R, lpeg.S, lpeg.C, lpeg.Cs
  local tonumber = tonumber
  local hexdigit = R('09', 'af', 'AF')
  local hexpair = hexdigit * hexdigit

  -- A plain code unit: four hex digits that are the code point itself.
  local function decode_unit(digits)
    return char(tonumber(digits, 16))
  end

  -- A surrogate pair. Each capture holds the three hex digits that follow the
  -- leading D of its code unit; masking with 0x3FF keeps the 10 payload bits.
  local function decode_pair(high, low)
    local upper = (tonumber(high, 16) & 0x3FF) << 10
    local lower = tonumber(low, 16) & 0x3FF
    return char(0x10000 + (upper | lower))
  end

  local code_unit = (hexpair * hexpair) / decode_unit
  local high_unit = S'Dd' * C(R('89', 'AB', 'ab') * hexpair) -- D800..DBFF
  local low_unit = S'Dd' * C(R('CF', 'cf') * hexpair)        -- DC00..DFFF
  local pair = (high_unit * low_unit) / decode_pair

  -- Pairs are tried first so that a high surrogate is not consumed on its own.
  utf16hex_to_utf8 = Cs((pair + code_unit)^0)
end

-- First the non-harf case

-- Standard caching setup

-- Used for fonts without usable tounicode data: the slot itself is taken as
-- the code point.
local identity_table = setmetatable({}, {__index = function(_, id) return char(id) end})

-- Convert one tounicode value to UTF-8 text. The value can have different
-- forms; the checks are ordered by how likely each form is to be encountered,
-- which is a small performance optimization.
local function decode_tounicode(value)
  local kind = type(value)
  if kind == 'string' then
    return utf16hex_to_utf8:match(value)
  end
  if kind == 'number' then
    return char(value)
  end
  if kind == 'table' then
    return char(unpack(value)) -- I haven't tested this case, but it should work
  end
  return value
end

-- Build the lazily filled slot -> text table for one font.
local function new_font_cache(characters)
  return setmetatable({}, {__index = function(cache, slot)
    local entry = characters[slot]
    -- Without a tounicode entry the slot number is decoded instead.
    local decoded = decode_tounicode(entry and entry.tounicode or slot)
    cache[slot] = decoded
    return decoded
  end})
end

-- Font id -> per-font cache, created on first access.
local cached_text = setmetatable({}, {__index = function(per_font, fid)
  local fontdir = getfont(fid)
  local characters = fontdir and fontdir.tounicode == 1 and fontdir.characters
  local font_cache
  if characters then
    font_cache = new_font_cache(characters)
  else
    font_cache = identity_table
  end
  per_font[fid] = font_cache
  return font_cache
end})

-- With the caches in place the tounicode path is a two-level lookup:
-- first the per-font table, then the slot inside it.
local function from_tounicode(n)
  local slot, fid = is_glyph(n)
  local font_cache = cached_text[fid]
  return font_cache[slot]
end

-- Walking the node lists. Only the glyph branch does anything interesting.
local traverse = node.traverse
local node_id = node.id
local glyph, glue, disc = node_id'glyph', node_id'glue', node_id'disc'
local hlist, vlist = node_id'hlist', node_id'vlist'
local extract_text_vlist
-- Appends the text found in a horizontal list to t, starting at index i, and
-- returns the next free index. Carrying i along instead of using #t+1 should
-- be slightly faster.
local function real_extract_text(head, t, i)
  for n, id in traverse(head) do
    if id == glyph then
      -- harf mode first: look for a glyph_info property. It will sometimes/often
      -- be an empty string; that is intentional and must *not* trigger the
      -- fallback, because the actual mapping is carried by surrounding chars.
      -- Only a missing property falls back to the tounicode lookup.
      local props = getproperty(n)
      local text = props and props.glyph_info
      if not text then
        text = from_tounicode(n)
      end
      t[i] = text
      i = i + 1
    elseif id == hlist then
      i = real_extract_text(n.head, t, i)
    elseif id == vlist then
      i = extract_text_vlist(n.head, t, i)
    elseif id == disc then
      -- only the replace list of a discretionary is followed
      i = real_extract_text(n.replace, t, i)
    elseif id == glue then
      -- 1001 is arbitrary but sufficiently high to be bigger than most weird glue uses
      if n.width > 1001 then
        t[i] = ' '
        i = i + 1
      end
    end
  end
  return i
end
-- Vertical-list counterpart of real_extract_text: only nested boxes are
-- followed, so glue in a vertical list never becomes a space.
-- Takes and returns the running output index like real_extract_text.
extract_text_vlist = function(head, t, i)
  for n, id in traverse(head) do
    if id == vlist then
      i = extract_text_vlist(n.head, t, i)
    elseif id == hlist then
      i = real_extract_text(n.head, t, i)
    end
  end
  return i
end
-- Module value: a function that takes a box node and returns the text found
-- in its contents as a single string.
return function(list)
  local pieces = {}
  real_extract_text(list.head, pieces, 1)
  return table.concat(pieces)
end

이는 일반 Lua 모듈로 사용할 수 있습니다.

\documentclass{article}
\usepackage{fontspec}

\newfontfamily{\devharf}{Noto Sans Devanagari}[Script=Devanagari, Renderer=HarfBuzz]
\newfontfamily{\devnode}{Noto Sans Devanagari}[Script=Devanagari, Renderer=Node]

\begin{document}

% Devanagari text is at the right end of following line
% of code, you might have to scroll right to read it
\setbox0=\hbox{Příliš žluťoučký \textit{kůň} úpěl \hbox{ďábelské} ódy difference diffierence. \devharf एक गांव -- में मोहन नाम का लड़का रहता था। उसके पिताजी एक मामूली मजदूर थे।}
\setbox1=\hbox{Příliš žluťoučký \textit{kůň} úpěl \hbox{ďábelské} ódy difference diffierence. \devnode एक गांव -- में मोहन नाम का लड़का रहता था। उसके पिताजी एक मामूली मजदूर थे।}

\directlua{
  % NOTE: no blank lines inside this argument. TeX turns an empty line into a
  % \par token, which would be passed to Lua verbatim and fail to parse.
  local extracttext = require'extracttext'
  % assert reports why a file could not be opened instead of failing later
  % with an "attempt to index a nil value" error.
  local f = assert(io.open("hello.harf.txt","w")) % Can reproduce the full input text
  f:write(extracttext(tex.getbox(0)))
  f:close()
  %
  f = assert(io.open("hello.node.txt","w")) % In node mode, we only get an approximation
  f:write(extracttext(tex.getbox(1)))
  f:close()
}

\box0
\box1
\end{document}

보다 일반적인 참고 사항: 보시다시피, 모양이 지정된 목록에서 텍스트를 가져오려면 몇 가지 작업이 필요하며, 특히 대리 쌍(surrogate pair) 등을 직접 매핑해야 하는 ToUnicode 경우에 그렇습니다. 이는 주로 모양이 지정된 텍스트가 그러한 용도로 사용되도록 만들어진 것이 *아니기* 때문입니다. 글리프 노드가 보호되자마자(즉, subtype(n) >= 256이거나 not is_char(n)이 참이 되면) .char 항목에는 더 이상 유니코드 값이 아닌 내부 식별자가 들어 있고, .font 항목은 더 이상 예상한 값이 아닐 수 있으며, 일부 글리프는 아예 글리프로 표현되지 않을 수도 있습니다. 텍스트의 시각적 표시뿐만 아니라 상자 뒤의 텍스트에 실제로 액세스하려는 대부분의 경우에는 목록이 모양 지정되기 *전에* 가로채는 편이 좋습니다.

Answer

노드 모드에서는 모양이 지정된 출력을 얻고 모양이 지정된 글리프를 입력 텍스트에 다시 고유하게 매핑할 수 없기 때문에 일반적으로 전체 텍스트를 복원하는 것이 불가능합니다. tounicode 값을 사용해서만 대략적으로 계산할 수 있습니다. 이는 실제 PDF 파일 ToUnicode CMap 항목에 매핑되므로 제한된 문자 모양 대 유니코드 매핑 모델을 따릅니다. 모든 문자 모양은 고정된 유니코드 코드 포인트 시퀀스와 동일합니다. 이러한 매핑은 렌더링 순서대로 연결됩니다. 보시다시피 이 모델은 Devanagari 문자 모양을 입력 텍스트에 매핑하는 데 충분하지 않습니다.

이 문제를 피하기 위해 harf 모드를 대신 사용할 수 있습니다. harf 모드는 모양이 지정된 문자 모양 목록을 제공할 뿐만 아니라, ToUnicode로는 올바르게 모델링할 수 없는 시퀀스에 대해 ToUnicode 매핑을 재정의하는 PDF 표시 콘텐츠(marked content) ActualText 항목을 추가로 생성하기 때문에 이 제한된 모델의 영향을 받지 않습니다. 이 매핑에 필요한 데이터는 glyph_info 속성을 사용하여 Lua 코드에서 쿼리할 수 있습니다. (이것은 문서화되지 않은 구현 세부 사항이며 향후 변경될 수 있습니다.)

텍스트에서 가능한 한 많은 것을 추출하려면 Lua 코드에서 이 속성 기반 접근 방식과 ToUnicode 기반 접근 방식을 결합할 수 있습니다.

다음 내용으로 extracttext.lua 파일을 만듭니다.

local type = type
local char = utf8.char
local unpack = table.unpack
local getproperty = node.getproperty
local getfont = font.getfont
local is_glyph = node.is_glyph

-- tounicode values are UTF-16 code units written in hex, so surrogate pairs
-- have to be recombined into one code point before encoding as UTF-8.
local utf16hex_to_utf8
do -- Untested, but should more or less work
  local R, S, C, Cs = lpeg.R, lpeg.S, lpeg.C, lpeg.Cs
  local tonumber = tonumber
  local hexdigit = R('09', 'af', 'AF')
  local hexpair = hexdigit * hexdigit

  -- A plain code unit: four hex digits that are the code point itself.
  local function decode_unit(digits)
    return char(tonumber(digits, 16))
  end

  -- A surrogate pair. Each capture holds the three hex digits that follow the
  -- leading D of its code unit; masking with 0x3FF keeps the 10 payload bits.
  local function decode_pair(high, low)
    local upper = (tonumber(high, 16) & 0x3FF) << 10
    local lower = tonumber(low, 16) & 0x3FF
    return char(0x10000 + (upper | lower))
  end

  local code_unit = (hexpair * hexpair) / decode_unit
  local high_unit = S'Dd' * C(R('89', 'AB', 'ab') * hexpair) -- D800..DBFF
  local low_unit = S'Dd' * C(R('CF', 'cf') * hexpair)        -- DC00..DFFF
  local pair = (high_unit * low_unit) / decode_pair

  -- Pairs are tried first so that a high surrogate is not consumed on its own.
  utf16hex_to_utf8 = Cs((pair + code_unit)^0)
end

-- First the non-harf case

-- Standard caching setup

-- Used for fonts without usable tounicode data: the slot itself is taken as
-- the code point.
local identity_table = setmetatable({}, {__index = function(_, id) return char(id) end})

-- Convert one tounicode value to UTF-8 text. The value can have different
-- forms; the checks are ordered by how likely each form is to be encountered,
-- which is a small performance optimization.
local function decode_tounicode(value)
  local kind = type(value)
  if kind == 'string' then
    return utf16hex_to_utf8:match(value)
  end
  if kind == 'number' then
    return char(value)
  end
  if kind == 'table' then
    return char(unpack(value)) -- I haven't tested this case, but it should work
  end
  return value
end

-- Build the lazily filled slot -> text table for one font.
local function new_font_cache(characters)
  return setmetatable({}, {__index = function(cache, slot)
    local entry = characters[slot]
    -- Without a tounicode entry the slot number is decoded instead.
    local decoded = decode_tounicode(entry and entry.tounicode or slot)
    cache[slot] = decoded
    return decoded
  end})
end

-- Font id -> per-font cache, created on first access.
local cached_text = setmetatable({}, {__index = function(per_font, fid)
  local fontdir = getfont(fid)
  local characters = fontdir and fontdir.tounicode == 1 and fontdir.characters
  local font_cache
  if characters then
    font_cache = new_font_cache(characters)
  else
    font_cache = identity_table
  end
  per_font[fid] = font_cache
  return font_cache
end})

-- With the caches in place the tounicode path is a two-level lookup:
-- first the per-font table, then the slot inside it.
local function from_tounicode(n)
  local slot, fid = is_glyph(n)
  local font_cache = cached_text[fid]
  return font_cache[slot]
end

-- Walking the node lists. Only the glyph branch does anything interesting.
local traverse = node.traverse
local node_id = node.id
local glyph, glue, disc = node_id'glyph', node_id'glue', node_id'disc'
local hlist, vlist = node_id'hlist', node_id'vlist'
local extract_text_vlist
-- Appends the text found in a horizontal list to t, starting at index i, and
-- returns the next free index. Carrying i along instead of using #t+1 should
-- be slightly faster.
local function real_extract_text(head, t, i)
  for n, id in traverse(head) do
    if id == glyph then
      -- harf mode first: look for a glyph_info property. It will sometimes/often
      -- be an empty string; that is intentional and must *not* trigger the
      -- fallback, because the actual mapping is carried by surrounding chars.
      -- Only a missing property falls back to the tounicode lookup.
      local props = getproperty(n)
      local text = props and props.glyph_info
      if not text then
        text = from_tounicode(n)
      end
      t[i] = text
      i = i + 1
    elseif id == hlist then
      i = real_extract_text(n.head, t, i)
    elseif id == vlist then
      i = extract_text_vlist(n.head, t, i)
    elseif id == disc then
      -- only the replace list of a discretionary is followed
      i = real_extract_text(n.replace, t, i)
    elseif id == glue then
      -- 1001 is arbitrary but sufficiently high to be bigger than most weird glue uses
      if n.width > 1001 then
        t[i] = ' '
        i = i + 1
      end
    end
  end
  return i
end
-- Vertical-list counterpart of real_extract_text: only nested boxes are
-- followed, so glue in a vertical list never becomes a space.
-- Takes and returns the running output index like real_extract_text.
extract_text_vlist = function(head, t, i)
  for n, id in traverse(head) do
    if id == vlist then
      i = extract_text_vlist(n.head, t, i)
    elseif id == hlist then
      i = real_extract_text(n.head, t, i)
    end
  end
  return i
end
-- Module value: a function that takes a box node and returns the text found
-- in its contents as a single string.
return function(list)
  local pieces = {}
  real_extract_text(list.head, pieces, 1)
  return table.concat(pieces)
end

이는 일반 Lua 모듈로 사용할 수 있습니다.

\documentclass{article}
\usepackage{fontspec}

\newfontfamily{\devharf}{Noto Sans Devanagari}[Script=Devanagari, Renderer=HarfBuzz]
\newfontfamily{\devnode}{Noto Sans Devanagari}[Script=Devanagari, Renderer=Node]

\begin{document}

% Devanagari text is at the right end of following line
% of code, you might have to scroll right to read it
\setbox0=\hbox{Příliš žluťoučký \textit{kůň} úpěl \hbox{ďábelské} ódy difference diffierence. \devharf एक गांव -- में मोहन नाम का लड़का रहता था। उसके पिताजी एक मामूली मजदूर थे।}
\setbox1=\hbox{Příliš žluťoučký \textit{kůň} úpěl \hbox{ďábelské} ódy difference diffierence. \devnode एक गांव -- में मोहन नाम का लड़का रहता था। उसके पिताजी एक मामूली मजदूर थे।}

\directlua{
  % NOTE: no blank lines inside this argument. TeX turns an empty line into a
  % \par token, which would be passed to Lua verbatim and fail to parse.
  local extracttext = require'extracttext'
  % assert reports why a file could not be opened instead of failing later
  % with an "attempt to index a nil value" error.
  local f = assert(io.open("hello.harf.txt","w")) % Can reproduce the full input text
  f:write(extracttext(tex.getbox(0)))
  f:close()
  %
  f = assert(io.open("hello.node.txt","w")) % In node mode, we only get an approximation
  f:write(extracttext(tex.getbox(1)))
  f:close()
}

\box0
\box1
\end{document}

보다 일반적인 참고 사항: 보시다시피, 모양이 지정된 목록에서 텍스트를 가져오려면 몇 가지 작업이 필요하며, 특히 대리 쌍(surrogate pair) 등을 직접 매핑해야 하는 ToUnicode 경우에 그렇습니다. 이는 주로 모양이 지정된 텍스트가 그러한 용도로 사용되도록 만들어진 것이 *아니기* 때문입니다. 글리프 노드가 보호되자마자(즉, subtype(n) >= 256이거나 not is_char(n)이 참이 되면) .char 항목에는 더 이상 유니코드 값이 아닌 내부 식별자가 들어 있고, .font 항목은 더 이상 예상한 값이 아닐 수 있으며, 일부 글리프는 아예 글리프로 표현되지 않을 수도 있습니다. 텍스트의 시각적 표시뿐만 아니라 상자 뒤의 텍스트에 실제로 액세스하려는 대부분의 경우에는 목록이 모양 지정되기 *전에* 가로채는 편이 좋습니다.

Question 2

Luaotfload에서 HarfBuzz 글꼴의 tounicode를 처리하는 방법에 대해서는 잘 모르지만, table.serialize로 글꼴 데이터를 살펴보면서 몇 가지 세부 사항을 알아냈습니다. HarfBuzz에 맞게 조정한 원래 코드는 다음과 같습니다.

\documentclass{article}
\usepackage[lmargin=0.5in,tmargin=0.5in,rmargin=0.5in,bmargin=0.5in]{geometry}
\usepackage{fontspec}
\usepackage{microtype}
\usepackage{luacode}

%\newfontscript{Devanagari}{deva,dev2}
\newfontfamily{\devanagarifam}{Noto Sans Devanagari}[Script=Devanagari, Scale=1, Renderer=HarfBuzz]
\newfontfamily{\arabicfam}{Amiri}[Script=Arabic, Scale=1, Renderer=HarfBuzz]

\begin{document}

\setbox0=\hbox{Příliš žluťoučký \textit{kůň} úpěl \hbox{ďábelské} ódy difference diffierence. \devanagarifam एक गांव -- में मोहन नाम का लड़का रहता था। उसके पिताजी एक मामूली मजदूर थे।}
\setbox1=\hbox{\arabicfam \textdir TRT  هذه المقالة عن براغ. لتصفح عناوين مشابهة، انظر براغ (توضيح).}

\begin{luacode*}
  -- local fontstyles = require "l4fontstyles"
  local char = unicode.utf8.char
  local glyph_id = node.id("glyph")
  local glue_id  = node.id("glue")
  local hlist_id = node.id("hlist")
  local vlist_id = node.id("vlist")
  local disc_id  = node.id("disc")
  local minglue  = tex.sp("0.2em")
  local usedcharacters = {}
  local identifiers = fonts.hashes.identifiers
  local fontcache = {}

  --- Convert a tounicode value into a list of UTF-8 characters.
  -- @param uchar tounicode value: a string of UTF-16 code units written as
  --   4-digit hex groups, a single code point number, a table of code point
  --   numbers, or nil
  -- @return table with one UTF-8 string per decoded code point (empty when
  --   uchar is nil or has an unsupported type)
  local function to_unicode_chars(uchar)
    local utf8char = utf8.char
    -- put characters into a table
    local current = {}
    local kind = type(uchar)
    if kind == "number" then
      -- a bare number is already a code point, not hex text
      current[1] = utf8char(uchar)
      return current
    elseif kind == "table" then
      for i = 1, #uchar do
        current[i] = utf8char(uchar[i])
      end
      return current
    elseif kind ~= "string" then
      return current
    end
    -- each UTF-16 code unit is 4 hex digits long; walk the string in 4-character chunks
    local i, len = 1, string.len(uchar)
    while i <= len do
      local unit = tonumber(string.sub(uchar, i, i + 3), 16)
      i = i + 4
      if unit and unit >= 0xD800 and unit <= 0xDBFF then
        -- high surrogate: merge it with a following low surrogate so that
        -- characters outside the BMP come out as one code point
        local low = tonumber(string.sub(uchar, i, i + 3), 16)
        if low and low >= 0xDC00 and low <= 0xDFFF then
          unit = 0x10000 + (unit - 0xD800) * 0x400 + (low - 0xDC00)
          i = i + 4
        end
      end
      -- chunks that are not valid hex are skipped instead of raising an error
      if unit then
        current[#current + 1] = utf8char(unit)
      end
    end
    return current
  end
  -- Cached character lookup, to speed things up.
  -- Returns the stored value for xchar in the given font, or nil when nothing
  -- is stored. The per-font cache entry is created on first use.
  local function get_character_from_cache(xchar, font_id)
    local entry = fontcache[font_id]
    if not entry then
      entry = {characters = {}}
      fontcache[font_id] = entry
    end
    return entry.characters[xchar]
  end

  -- Save a converted character to the cache for faster lookup.
  -- The value is stored under fontcache[font_id].characters[xchar], which is
  -- the slot get_character_from_cache reads; storing it directly on the font
  -- entry meant that cached values were never found again.
  -- Returns `replace` unchanged so callers can write `return save_...(...)`.
  local function save_character_to_cache(xchar, font_id, replace)
    local current_font = fontcache[font_id]
    if not current_font then
      -- tolerate being called before the font entry was initialized
      current_font = {characters = {}}
      fontcache[font_id] = current_font
    end
    local characters = current_font.characters
    if not characters then
      characters = {}
      current_font.characters = characters
    end
    characters[xchar] = replace
    -- return value
    return replace
  end

  -- Keep references to the HarfBuzz tables of a font in its cache entry so
  -- that later lookups are cheap.
  -- Returns two tables: the glyphs table (hb.shared.glyphs) and a reverse map
  -- built from hb.shared.unicodes (value -> key, i.e. glyph id -> Unicode).
  -- Expects fontcache[font_id] to exist already.
  local function initialize_harfbuzz_cache(font_id, hb)
    local entry = fontcache[font_id]
    -- source 1: the per-glyph table, taken over as is
    if not entry.glyphs then
      entry.glyphs = hb.shared.glyphs
    end
    -- source 2: hb.shared.unicodes maps Unicode to glyph id, so it has to be
    -- inverted once before it can be queried by glyph id
    local reverse = entry.backmap
    if not reverse then
      reverse = {}
      entry.backmap = reverse
      for codepoint, glyph_index in pairs(hb.shared.unicodes) do
        reverse[glyph_index] = codepoint
      end
    end
    -- save it back to the font cache
    fontcache[font_id] = entry
    return entry.glyphs, reverse
  end

  -- Return a table of UTF-8 characters for character slot xchar of font font_id.
  -- Lookup order: cache, the font's own tounicode entry, HarfBuzz glyph
  -- tounicode, HarfBuzz reverse Unicode map. For a HarfBuzz font with no
  -- mapping the character is discarded (empty table); otherwise xchar itself
  -- is used as the code point. Every result is stored in the cache.
  -- Missing font or character records are treated as "no data" instead of
  -- raising "attempt to index a nil value".
  local function get_unicode(xchar,font_id)
    -- try to load character from cache first
    local current_char = get_character_from_cache(xchar, font_id)
    if current_char then return current_char end
    -- get tounicode for non HarfBuzz fonts
    local fontdata = identifiers[font_id]
    local characters = fontdata and fontdata.characters
    local chardata = characters and characters[xchar]
    local uchar = chardata and chardata.tounicode
    -- stop processing if tounicode exists
    if uchar then return save_character_to_cache(xchar, font_id, to_unicode_chars(uchar)) end
    -- detect if font is processed by Harfbuzz
    local hb = fontdata and fontdata.hb
    -- try HarfBuzz data
    if hb then
      -- get glyph index of the character (nil when the slot has no record)
      local index = chardata and chardata.index
      -- load HarfBuzz tables from cache
      local glyphs, backmap = initialize_harfbuzz_cache(font_id, hb)
      -- get tounicode field from HarfBuzz glyph info
      local glyph = index and glyphs and glyphs[index]
      local tounicode = glyph and glyph.tounicode
      if tounicode then
        return save_character_to_cache(xchar, font_id, to_unicode_chars(tounicode))
      end
      -- if this fails, try backmap, which contains mapping between glyph index and Unicode
      local backuni = index and backmap and backmap[index]
      if backuni then
        return save_character_to_cache(xchar, font_id, {char(backuni)})
      end
      -- if this fails too, discard this character
      return save_character_to_cache(xchar, font_id, {})
    end
    -- return just the original char if everything else fails
    return save_character_to_cache(xchar, font_id, {char(xchar)})
  end

  -- Collect the text of a node list as one UTF-8 string, descending into
  -- discretionaries and nested boxes.
  local function nodeText(n)
    -- output buffer
    local pieces = {}
    for item in node.traverse(n) do
      local id = item.id
      if id == glyph_id then
        -- a glyph can stand for several characters; append each of them
        for _, piece in ipairs(get_unicode(item.char, item.font)) do
          pieces[#pieces + 1] = piece
        end
      elseif id == glue_id and node.getglue(item) > minglue then
        -- glue wider than minglue counts as a space
        pieces[#pieces + 1] = " "
      elseif id == disc_id then
        -- discretionaries: use the replace list
        pieces[#pieces + 1] = nodeText(item.replace)
      elseif id == hlist_id or id == vlist_id then
        -- recursively process hlist and vlist nodes
        pieces[#pieces + 1] = nodeText(item.head)
      end
    end
    return table.concat(pieces)
  end
  -- Extract the text of boxes 0 and 1, echo the first to the terminal and
  -- write both, one after the other, to hello.txt.
  local n = tex.getbox(0)
  local n1 = tex.getbox(1)
  -- each box is converted once and the result reused
  local text0 = nodeText(n.head)
  local text1 = nodeText(n1.head)
  print(text0)
  -- assert reports why the file could not be opened instead of failing later
  -- with an "attempt to index a nil value" error
  local f = assert(io.open("hello.txt","w"))
  f:write(text0)
  f:write(text1)
  f:close()
\end{luacode*}

\box0

\box1
\end{document}

위키피디아에서 가져온 아랍어 샘플도 추가했습니다. hello.txt의 내용은 다음과 같습니다.

Příliš žluťoučký kůň úpěl ďábelské ódy difference diffierence. एक गांव -- में मोहन नाम का लड़का रहता था। उसके पताजी एक मामूली मजदूर थे।هذه المقالة عن براغ. لتصفح عناوين مشابهة، انظر براغ (توضيح).

두 가지 중요한 기능은 다음과 같습니다

  --- Convert a tounicode value into a list of UTF-8 characters.
  -- @param uchar tounicode value: a string of UTF-16 code units written as
  --   4-digit hex groups, a single code point number, a table of code point
  --   numbers, or nil
  -- @return table with one UTF-8 string per decoded code point (empty when
  --   uchar is nil or has an unsupported type)
  local function to_unicode_chars(uchar)
    local utf8char = utf8.char
    local current = {}
    local kind = type(uchar)
    if kind == "number" then
      -- a bare number is already a code point, not hex text
      current[1] = utf8char(uchar)
      return current
    elseif kind == "table" then
      for i = 1, #uchar do
        current[i] = utf8char(uchar[i])
      end
      return current
    elseif kind ~= "string" then
      return current
    end
    -- each UTF-16 code unit is 4 hex digits long; walk the string in 4-character chunks
    local i, len = 1, string.len(uchar)
    while i <= len do
      local unit = tonumber(string.sub(uchar, i, i + 3), 16)
      i = i + 4
      if unit and unit >= 0xD800 and unit <= 0xDBFF then
        -- high surrogate: merge it with a following low surrogate so that
        -- characters outside the BMP come out as one code point
        local low = tonumber(string.sub(uchar, i, i + 3), 16)
        if low and low >= 0xDC00 and low <= 0xDFFF then
          unit = 0x10000 + (unit - 0xD800) * 0x400 + (low - 0xDC00)
          i = i + 4
        end
      end
      -- chunks that are not valid hex are skipped instead of raising an error
      if unit then
        current[#current + 1] = utf8char(unit)
      end
    end
    return current
  end

to_unicode_chars 함수는 tounicode 항목을 4바이트(16진수 네 자리) 청크로 분할한 다음 각 청크를 UTF-8 문자로 변환합니다. tounicode 항목이 없는 문자 모양도 처리할 수 있으며, 이 경우 빈 테이블을 반환합니다.

  -- Return a table of UTF-8 characters for character slot xchar of font font_id.
  -- Lookup order: cache, the font's own tounicode entry, HarfBuzz glyph
  -- tounicode, HarfBuzz reverse Unicode map. For a HarfBuzz font with no
  -- mapping the character is discarded (empty table); otherwise xchar itself
  -- is used as the code point. Every result is stored in the cache.
  -- Missing font or character records are treated as "no data" instead of
  -- raising "attempt to index a nil value".
  local function get_unicode(xchar,font_id)
    -- try to load character from cache first
    local current_char = get_character_from_cache(xchar, font_id)
    if current_char then return current_char end
    -- get tounicode for non HarfBuzz fonts
    local fontdata = identifiers[font_id]
    local characters = fontdata and fontdata.characters
    local chardata = characters and characters[xchar]
    local uchar = chardata and chardata.tounicode
    -- stop processing if tounicode exists
    if uchar then return save_character_to_cache(xchar, font_id, to_unicode_chars(uchar)) end
    -- detect if font is processed by Harfbuzz
    local hb = fontdata and fontdata.hb
    -- try HarfBuzz data
    if hb then
      -- get glyph index of the character (nil when the slot has no record)
      local index = chardata and chardata.index
      -- load HarfBuzz tables from cache
      local glyphs, backmap = initialize_harfbuzz_cache(font_id, hb)
      -- get tounicode field from HarfBuzz glyph info
      local glyph = index and glyphs and glyphs[index]
      local tounicode = glyph and glyph.tounicode
      if tounicode then
        return save_character_to_cache(xchar, font_id, to_unicode_chars(tounicode))
      end
      -- if this fails, try backmap, which contains mapping between glyph index and Unicode
      local backuni = index and backmap and backmap[index]
      if backuni then
        return save_character_to_cache(xchar, font_id, {char(backuni)})
      end
      -- if this fails too, discard this character
      return save_character_to_cache(xchar, font_id, {})
    end
    -- return just the original char if everything else fails
    return save_character_to_cache(xchar, font_id, {char(xchar)})
  end

이 함수는 먼저 현재 글꼴 정보에서 유니코드 데이터를 로드하려고 시도합니다. 실패하면 HarfBuzz 테이블에서 조회를 시도합니다. 대부분의 문자는 glyphs 테이블에 tounicode 매핑이 있습니다. 이를 사용할 수 없는 경우 문자 모양 인덱스와 유니코드 간의 매핑이 포함된 unicodes 테이블을 시도합니다. 그래도 실패하면 이 문자를 버립니다.

Answer