XeLaTeX, LuaLaTeX, 글꼴 사양, 유니코드 및 정규화

Question

저는 XeTeX를 사용하지 않기 때문에 처음 두 질문에 대한 답을 모르지만 세 번째 질문에 대한 옵션을 제공하고 싶습니다.

아서의 코드 덕분에 LuaLaTeX에서 유니코드 정규화를 위한 기본 패키지를 만들 수 있었습니다. 현재 LuaTeX에서 작동하게 하려면 코드를 약간만 수정하면 되었습니다. 여기에는 주요 Lua 파일만 게시하겠습니다. 전체 프로젝트는 Github의 uninormalize에서 볼 수 있습니다.

샘플 사용법:

% Minimal LuaLaTeX test document for the uninormalize package.
\documentclass{article}
\usepackage{fontspec}
\usepackage[czech]{babel}
% Test font; the sample text below checks which combined characters it renders.
\setmainfont{Linux Libertine O}
% The options appear to select node-based normalization, switch the input
% buffer normalization off and enable debug messages -- presumably mapping to
% M.nodes, M.buffer and M.debug of the Lua module; confirm in the package file.
\usepackage[nodes,buffer=false, debug]{uninormalize}
\begin{document}

Some tests:
\begin{itemize}
  \item combined letter ᾳ %GREEK SMALL LETTER ALPHA (U+03B1) + COMBINING GREEK YPOGEGRAMMENI (U+0345)
  \item normal letter ᾳ% GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI (U+1FB3)
\end{itemize}

Some more combined and normal letters: 
óóōōöö

Linux Libertine does support some combined chars: \parbox{4em}{příliš}
\end{document}

(이 파일의 올바른 버전은 Github에 있으며, 이 예에서는 결합된 문자가 잘못 전송되었습니다.)

패키지의 주요 아이디어는 다음과 같습니다. 입력을 처리하면서 문자 뒤에 결합 기호가 이어지면, 그 조합을 정규화된 NFC 형식으로 대체합니다. 두 가지 방법이 제공됩니다. 첫 번째 접근 방식은 노드 처리 콜백을 사용하여 분해된 글리프를 정규화된 문자로 바꾸는 것이었습니다. 이는 노드 속성을 사용하여 어디에서나 처리를 켜고 끌 수 있다는 장점이 있습니다. 또 다른 가능한 기능은 현재 글꼴에 정규화된 문자가 포함되어 있는지 확인하고, 그렇지 않은 경우 원래 형식을 사용하는 것입니다. 불행하게도 내 테스트에서는 일부 문자에서 실패합니다. 특히 결합형 í는 노드에서 `i + ´`가 아니라 `dotless i + ´`로 들어 있어 정규화 후에도 올바른 문자가 만들어지지 않으므로, 원래 문자들이 그대로 사용됩니다. 그러나 이렇게 하면 악센트 위치가 잘못된 결과가 나옵니다. 따라서 이 방법은 약간의 수정이 필요하거나 완전히 잘못된 방법입니다.

따라서 다른 방법은 process_input_buffer콜백을 사용하여 디스크에서 입력 파일을 읽을 때 입력 파일을 정규화하는 것입니다. 이 방법은 글꼴의 정보를 사용하는 것을 허용하지 않으며 줄 중간에서 끄는 것도 허용하지 않지만 구현하기가 훨씬 쉽습니다. 콜백 함수는 다음과 같습니다.

--- Input-buffer callback: hand the line to NFC and return its result.
-- @param line one line of input text
-- @return whatever NFC returns for `line`
function buffer_callback(line) 
  return NFC(line)
end

이는 노드 처리 버전에 3일을 소비한 후에 정말 좋은 결과를 얻었습니다.

호기심을 위해 이것은 Lua 패키지입니다:

-- Module table; it is returned at the end of the file.
local M = {}
-- Load the supporting Lua files. They are presumably what provides the
-- `unicode.conformance` functions aliased below -- TODO confirm.
-- NOTE(review): dofile receives bare file names, so the files are looked up
-- relative to the current working directory.
dofile("unicode-names.lua")
dofile('unicode-normalization.lua')
-- Local aliases for the library functions and data used by this module.
local NFC = unicode.conformance.toNFC
local char = unicode.utf8.char
local gmatch = unicode.utf8.gmatch
local name = unicode.conformance.name
local byte = unicode.utf8.byte
local unidata = characters.data
local length = unicode.utf8.len

-- Debug messages are printed only while this flag is truthy (see debug_msg).
M.debug = false

--- Print a debug message when debugging is enabled.
-- Does nothing unless `M.debug` is truthy. Every argument is forwarded to
-- `print` after the "[uninormalize]" prefix. The arguments are passed on as
-- real varargs instead of being packed into a table and unpacked again:
-- a nil argument can therefore no longer cut the message short, and the
-- function does not depend on the global `unpack`, which does not exist in
-- Lua 5.3 and later.
-- @param ... values to print
local function debug_msg(...)
  if M.debug then
    print("[uninormalize]", ...)
  end
end

--- Turn a sequence into a set.
-- @param t sequence of values; reading stops at the first nil entry
-- @return a new table in which every value of `t` is a key mapped to true
local function make_hash(t)
  local set = {}
  local index = 1
  local value = t[index]
  while value ~= nil do
    set[value] = true
    index = index + 1
    value = t[index]
  end
  return set
end

-- Category codes treated as base letters: a glyph whose category is in this
-- set may start a sequence that gets normalized. The codes are spelled in
-- lower case, presumably to match the `category` values in characters.data.
local letter_categories = make_hash {"lu","ll","lt","lo","lm"}

-- Category codes treated as combining marks that attach to a preceding letter.
local mark_categories = make_hash {"mn","mc","me"}

--- Debug helper: report the name of every character in `s`.
-- The names are joined with ":" and handed to debug_msg under the
-- "characters" label.
-- @param s string whose characters are listed
local function printchars(s)
    local names = {}
    for ch in gmatch(s, ".") do
        local codepoint = byte(ch)
        names[#names + 1] = name(codepoint)
    end
    local joined = table.concat(names, ":")
    debug_msg("characters", joined)
end

-- Cache of categories that were already looked up, keyed by character code.
local categories = {}

--- Look up the category of a character code.
-- The value is the `category` field of the character's entry in `unidata`;
-- successful lookups are remembered in `categories`.
-- @param charcode character code; nil is treated as ""
-- @return the category, or nil when the entry or its category is missing
local function get_category(charcode)
  local key = charcode or ""
  local cached = categories[key]
  if cached then
    return cached
  end
  local entry = unidata[key] or {}
  local category = entry.category
  categories[key] = category
  return category
end

--- Return the character code stored in node `n` and that code's category.
-- @param n node with a `char` field
-- @return character code, category (as returned by get_category)
local function glyph_info(n)
  local code = n.char
  local category = get_category(code)
  return code, category
end

-- Node id of glyph nodes, looked up by name instead of hard-coding a number
-- because the numeric id is not the same in every LuaTeX release.
local glyph_id = node.id("glyph")

--- Return the character of a glyph node if it is a combining mark.
-- @param n node to inspect; may be nil (for example when the previously
--   removed node was the last one of its list)
-- @return the character as a string when `n` is a glyph whose category is in
--   `mark_categories`, otherwise false
local function get_mark(n)
  if n and n.id == glyph_id then
    local character, cat = glyph_info(n)
    if mark_categories[cat] then
      return char(character)
    end
  end
  return false
end

-- Node id of glyph nodes, looked up by name instead of hard-coding a number
-- because the numeric id is not the same in every LuaTeX release.
local glyph_id = node.id("glyph")

--- Insert one glyph node per character of `s` before `nextn`.
-- @param head    head of the node list
-- @param nextn   node in front of which the new glyphs are inserted
-- @param s       string whose characters become the new glyphs
-- @param lang    value for the `lang` field of every new glyph
-- @param font    value for the `font` field of every new glyph
-- @param subtype subtype passed to node.new for every new glyph
-- @return the head as returned by node.insert_before, and the last glyph
--   that was inserted (nil when `s` contains no characters)
local function make_glyphs(head, nextn, s, lang, font, subtype)
  -- the "multi letter" message is only emitted when the normalized text is
  -- not exactly one character long
  local report = length(s) ~= 1
  local last
  for ch in gmatch(s, ".") do
    if report then
      debug_msg("multi letter", ch)
    end
    local glyph = node.new(glyph_id, subtype)
    glyph.lang = lang
    glyph.font = font
    glyph.char = byte(ch)
    -- keep both results local; every call returns the updated head
    head, last = node.insert_before(head, nextn, glyph)
  end
  return head, last
end

--- Replace a base glyph and the combining marks following it by the NFC form
--- of that character sequence.
-- The base glyph `n` and every directly following mark glyph are removed
-- from the list and freed; glyphs for the normalized text are then inserted
-- at the same position with the language, font and subtype of `n`.
-- @param head head of the node list
-- @param n    base glyph node; it must not be used after this call
-- @return the new head and the node that followed the removed sequence
--   (nil when the sequence ended the list)
local function normalize_marks(head, n)
  local lang, font, subtype = n.lang, n.font, n.subtype
  local text = { char(n.char) }
  local nextn
  head, nextn = node.remove(head, n)
  -- node.remove only unlinks the node, so it has to be released explicitly
  node.free(n)
  -- nextn is nil when the removed node was the last one of the list
  local info = nextn and get_mark(nextn)
  while info do
    text[#text + 1] = info
    local mark = nextn
    head, nextn = node.remove(head, mark)
    node.free(mark)
    info = nextn and get_mark(nextn)
  end
  local s = NFC(table.concat(text))
  debug_msg("We've got mark: " .. s)
  head = make_glyphs(head, nextn, s, lang, font, subtype)
  if M.debug then
    -- walking the whole list is only worth it when the result is printed
    local seen = {}
    for x in node.traverse_id(node.id("glyph"), head) do
      seen[#seen + 1] = char(x.char)
    end
    debug_msg("Variables ", table.concat(seen, ":"), table.concat(text, ";"), char(byte(s)), length(s))
  end
  return head, nextn
end

-- Node id of glyph nodes, looked up by name instead of hard-coding a number
-- because the numeric id is not the same in every LuaTeX release.
local glyph_id = node.id("glyph")

--- Normalize the sequence starting at glyph `n` if it is a letter that is
--- directly followed by a combining mark.
-- @param head head of the node list
-- @param n    glyph node to examine
-- @return head and the node at which the caller should continue: the result
--   of normalize_marks when a sequence was replaced, otherwise the unchanged
--   head and `n.next`
local function normalize_glyphs(head, n)
  local _, category = glyph_info(n)
  if letter_categories[category] then
    local nextn = n.next
    -- nextn is nil when n is the last node of the list
    if nextn and nextn.id == glyph_id then
      local _, nextcat = glyph_info(nextn)
      if mark_categories[nextcat] then
        return normalize_marks(head, n)
      end
    end
  end
  return head, n.next
end


-- Node id of glyph nodes, looked up by name instead of hard-coding a number
-- because the numeric id is not the same in every LuaTeX release.
local glyph_id = node.id("glyph")

--- Walk a node list and normalize every letter + combining mark sequence.
-- Each glyph node is handed to normalize_glyphs, which may replace nodes and
-- tells the loop where to continue. With `M.debug` set, the name and
-- category of every visited glyph are printed, and each run of glyphs is
-- reported as a "text chunk" when a non-glyph node ends it.
-- @param head head of the node list
-- @return the (possibly changed) head of the list
function M.nodes(head)
  local chunk = {}
  local n = head
  while n do
    if n.id == glyph_id then
      -- collect debug information only when it is going to be printed
      if M.debug then
        local charcode = n.char
        debug_msg("unicode name", name(charcode))
        debug_msg("character category", get_category(charcode))
        chunk[#chunk + 1] = char(charcode)
      end
      head, n = normalize_glyphs(head, n)
    else
      if #chunk > 0 then
        debug_msg("text chunk", table.concat(chunk))
        debug_msg("----------")
        chunk = {}
      end
      n = n.next
    end
  end
  return head
end

--[[
-- These functions aren't needed when processing buffer. We can call NFC on the whole input line
local unibytes = {}

local function get_charcategory(s)
  local s = s or ""
  local b = unibytes[s] or byte(s) or 0
  unibytes[s] = b
  return get_category(b)
end

local function normalize_charmarks(t,i)
  local c = {t[i]}
  local i = i + 1
  local s = get_charcategory(t[i])
  while mark_categories[s] do
    c[#c+1] = t[i]
    i = i + 1
    s = get_charcategory(t[i])
  end
  return NFC(table.concat(c)), i
end

local function normalize_char(t,i)
  local ch = t[i]
  local c = get_charcategory(ch)
  if letter_categories[c] then
    local nextc = get_charcategory(t[i+1])
    if mark_categories[nextc] then
      return normalize_charmarks(t,i)
    end
  end
  return ch, i+1
end
-- ]]
--- Normalize one line of input by passing it to NFC.
-- The long comment below holds an earlier character-by-character version;
-- it is not executed.
-- @param line input line
-- @return whatever NFC returns for `line`
function M.buffer(line)
  --[[
  local t = {}
  local new_t = {}
  -- we need to make a table with all uni chars on the line
  for x in gmatch(line,".") do
    t[#t+1] = x
  end
  local i = 1
  -- normalize next char
  local c, i = normalize_char(t, i)
  new_t[#new_t+1] = c
  while t[i] do
    c, i = normalize_char(t,i)
    -- local  c = t[i]
    -- i =  i + 1
    new_t[#new_t+1] = c
  end
  return table.concat(new_t)
  --]]
  return NFC(line)
end


return M

이제 사진을 찍을 시간입니다.

정규화 없이:

여기에 이미지 설명을 입력하세요

구성된 그리스어 문자가 잘못되었음을 알 수 있습니다. 다른 조합은 Linux Libertine에서 지원됩니다.

노드 정규화:

여기에 이미지 설명을 입력하세요

그리스 문자는 정확하지만, příliš의 첫 번째 í가 틀렸습니다. 이것이 제가 얘기했던 문제입니다.

이제 버퍼 정규화:

여기에 이미지 설명을 입력하세요

이제 모든 게 괜찮아

Answer 1

저는 XeTeX를 사용하지 않기 때문에 처음 두 질문에 대한 답을 모르지만 세 번째 질문에 대한 옵션을 제공하고 싶습니다.

아서의 코드 덕분에 LuaLaTeX에서 유니코드 정규화를 위한 기본 패키지를 만들 수 있었습니다. 현재 LuaTeX에서 작동하게 하려면 코드를 약간만 수정하면 되었습니다. 여기에는 주요 Lua 파일만 게시하겠습니다. 전체 프로젝트는 Github의 uninormalize에서 볼 수 있습니다.

샘플 사용법:

% Minimal LuaLaTeX test document for the uninormalize package.
\documentclass{article}
\usepackage{fontspec}
\usepackage[czech]{babel}
% Test font; the sample text below checks which combined characters it renders.
\setmainfont{Linux Libertine O}
% The options appear to select node-based normalization, switch the input
% buffer normalization off and enable debug messages -- presumably mapping to
% M.nodes, M.buffer and M.debug of the Lua module; confirm in the package file.
\usepackage[nodes,buffer=false, debug]{uninormalize}
\begin{document}

Some tests:
\begin{itemize}
  \item combined letter ᾳ %GREEK SMALL LETTER ALPHA (U+03B1) + COMBINING GREEK YPOGEGRAMMENI (U+0345)
  \item normal letter ᾳ% GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI (U+1FB3)
\end{itemize}

Some more combined and normal letters: 
óóōōöö

Linux Libertine does support some combined chars: \parbox{4em}{příliš}
\end{document}

(이 파일의 올바른 버전은 Github에 있으며, 이 예에서는 결합된 문자가 잘못 전송되었습니다.)

패키지의 주요 아이디어는 다음과 같습니다. 입력을 처리하면서 문자 뒤에 결합 기호가 이어지면, 그 조합을 정규화된 NFC 형식으로 대체합니다. 두 가지 방법이 제공됩니다. 첫 번째 접근 방식은 노드 처리 콜백을 사용하여 분해된 글리프를 정규화된 문자로 바꾸는 것이었습니다. 이는 노드 속성을 사용하여 어디에서나 처리를 켜고 끌 수 있다는 장점이 있습니다. 또 다른 가능한 기능은 현재 글꼴에 정규화된 문자가 포함되어 있는지 확인하고, 그렇지 않은 경우 원래 형식을 사용하는 것입니다. 불행하게도 내 테스트에서는 일부 문자에서 실패합니다. 특히 결합형 í는 노드에서 `i + ´`가 아니라 `dotless i + ´`로 들어 있어 정규화 후에도 올바른 문자가 만들어지지 않으므로, 원래 문자들이 그대로 사용됩니다. 그러나 이렇게 하면 악센트 위치가 잘못된 결과가 나옵니다. 따라서 이 방법은 약간의 수정이 필요하거나 완전히 잘못된 방법입니다.

따라서 다른 방법은 process_input_buffer콜백을 사용하여 디스크에서 입력 파일을 읽을 때 입력 파일을 정규화하는 것입니다. 이 방법은 글꼴의 정보를 사용하는 것을 허용하지 않으며 줄 중간에서 끄는 것도 허용하지 않지만 구현하기가 훨씬 쉽습니다. 콜백 함수는 다음과 같습니다.

--- Input-buffer callback: hand the line to NFC and return its result.
-- @param line one line of input text
-- @return whatever NFC returns for `line`
function buffer_callback(line) 
  return NFC(line)
end

이는 노드 처리 버전에 3일을 소비한 후에 정말 좋은 결과를 얻었습니다.

호기심을 위해 이것은 Lua 패키지입니다:

-- Module table; it is returned at the end of the file.
local M = {}
-- Load the supporting Lua files. They are presumably what provides the
-- `unicode.conformance` functions aliased below -- TODO confirm.
-- NOTE(review): dofile receives bare file names, so the files are looked up
-- relative to the current working directory.
dofile("unicode-names.lua")
dofile('unicode-normalization.lua')
-- Local aliases for the library functions and data used by this module.
local NFC = unicode.conformance.toNFC
local char = unicode.utf8.char
local gmatch = unicode.utf8.gmatch
local name = unicode.conformance.name
local byte = unicode.utf8.byte
local unidata = characters.data
local length = unicode.utf8.len

-- Debug messages are printed only while this flag is truthy (see debug_msg).
M.debug = false

--- Print a debug message when debugging is enabled.
-- Does nothing unless `M.debug` is truthy. Every argument is forwarded to
-- `print` after the "[uninormalize]" prefix. The arguments are passed on as
-- real varargs instead of being packed into a table and unpacked again:
-- a nil argument can therefore no longer cut the message short, and the
-- function does not depend on the global `unpack`, which does not exist in
-- Lua 5.3 and later.
-- @param ... values to print
local function debug_msg(...)
  if M.debug then
    print("[uninormalize]", ...)
  end
end

--- Turn a sequence into a set.
-- @param t sequence of values; reading stops at the first nil entry
-- @return a new table in which every value of `t` is a key mapped to true
local function make_hash(t)
  local set = {}
  local index = 1
  local value = t[index]
  while value ~= nil do
    set[value] = true
    index = index + 1
    value = t[index]
  end
  return set
end

-- Category codes treated as base letters: a glyph whose category is in this
-- set may start a sequence that gets normalized. The codes are spelled in
-- lower case, presumably to match the `category` values in characters.data.
local letter_categories = make_hash {"lu","ll","lt","lo","lm"}

-- Category codes treated as combining marks that attach to a preceding letter.
local mark_categories = make_hash {"mn","mc","me"}

--- Debug helper: report the name of every character in `s`.
-- The names are joined with ":" and handed to debug_msg under the
-- "characters" label.
-- @param s string whose characters are listed
local function printchars(s)
    local names = {}
    for ch in gmatch(s, ".") do
        local codepoint = byte(ch)
        names[#names + 1] = name(codepoint)
    end
    local joined = table.concat(names, ":")
    debug_msg("characters", joined)
end

-- Cache of categories that were already looked up, keyed by character code.
local categories = {}

--- Look up the category of a character code.
-- The value is the `category` field of the character's entry in `unidata`;
-- successful lookups are remembered in `categories`.
-- @param charcode character code; nil is treated as ""
-- @return the category, or nil when the entry or its category is missing
local function get_category(charcode)
  local key = charcode or ""
  local cached = categories[key]
  if cached then
    return cached
  end
  local entry = unidata[key] or {}
  local category = entry.category
  categories[key] = category
  return category
end

--- Return the character code stored in node `n` and that code's category.
-- @param n node with a `char` field
-- @return character code, category (as returned by get_category)
local function glyph_info(n)
  local code = n.char
  local category = get_category(code)
  return code, category
end

-- Node id of glyph nodes, looked up by name instead of hard-coding a number
-- because the numeric id is not the same in every LuaTeX release.
local glyph_id = node.id("glyph")

--- Return the character of a glyph node if it is a combining mark.
-- @param n node to inspect; may be nil (for example when the previously
--   removed node was the last one of its list)
-- @return the character as a string when `n` is a glyph whose category is in
--   `mark_categories`, otherwise false
local function get_mark(n)
  if n and n.id == glyph_id then
    local character, cat = glyph_info(n)
    if mark_categories[cat] then
      return char(character)
    end
  end
  return false
end

-- Node id of glyph nodes, looked up by name instead of hard-coding a number
-- because the numeric id is not the same in every LuaTeX release.
local glyph_id = node.id("glyph")

--- Insert one glyph node per character of `s` before `nextn`.
-- @param head    head of the node list
-- @param nextn   node in front of which the new glyphs are inserted
-- @param s       string whose characters become the new glyphs
-- @param lang    value for the `lang` field of every new glyph
-- @param font    value for the `font` field of every new glyph
-- @param subtype subtype passed to node.new for every new glyph
-- @return the head as returned by node.insert_before, and the last glyph
--   that was inserted (nil when `s` contains no characters)
local function make_glyphs(head, nextn, s, lang, font, subtype)
  -- the "multi letter" message is only emitted when the normalized text is
  -- not exactly one character long
  local report = length(s) ~= 1
  local last
  for ch in gmatch(s, ".") do
    if report then
      debug_msg("multi letter", ch)
    end
    local glyph = node.new(glyph_id, subtype)
    glyph.lang = lang
    glyph.font = font
    glyph.char = byte(ch)
    -- keep both results local; every call returns the updated head
    head, last = node.insert_before(head, nextn, glyph)
  end
  return head, last
end

--- Replace a base glyph and the combining marks following it by the NFC form
--- of that character sequence.
-- The base glyph `n` and every directly following mark glyph are removed
-- from the list and freed; glyphs for the normalized text are then inserted
-- at the same position with the language, font and subtype of `n`.
-- @param head head of the node list
-- @param n    base glyph node; it must not be used after this call
-- @return the new head and the node that followed the removed sequence
--   (nil when the sequence ended the list)
local function normalize_marks(head, n)
  local lang, font, subtype = n.lang, n.font, n.subtype
  local text = { char(n.char) }
  local nextn
  head, nextn = node.remove(head, n)
  -- node.remove only unlinks the node, so it has to be released explicitly
  node.free(n)
  -- nextn is nil when the removed node was the last one of the list
  local info = nextn and get_mark(nextn)
  while info do
    text[#text + 1] = info
    local mark = nextn
    head, nextn = node.remove(head, mark)
    node.free(mark)
    info = nextn and get_mark(nextn)
  end
  local s = NFC(table.concat(text))
  debug_msg("We've got mark: " .. s)
  head = make_glyphs(head, nextn, s, lang, font, subtype)
  if M.debug then
    -- walking the whole list is only worth it when the result is printed
    local seen = {}
    for x in node.traverse_id(node.id("glyph"), head) do
      seen[#seen + 1] = char(x.char)
    end
    debug_msg("Variables ", table.concat(seen, ":"), table.concat(text, ";"), char(byte(s)), length(s))
  end
  return head, nextn
end

-- Node id of glyph nodes, looked up by name instead of hard-coding a number
-- because the numeric id is not the same in every LuaTeX release.
local glyph_id = node.id("glyph")

--- Normalize the sequence starting at glyph `n` if it is a letter that is
--- directly followed by a combining mark.
-- @param head head of the node list
-- @param n    glyph node to examine
-- @return head and the node at which the caller should continue: the result
--   of normalize_marks when a sequence was replaced, otherwise the unchanged
--   head and `n.next`
local function normalize_glyphs(head, n)
  local _, category = glyph_info(n)
  if letter_categories[category] then
    local nextn = n.next
    -- nextn is nil when n is the last node of the list
    if nextn and nextn.id == glyph_id then
      local _, nextcat = glyph_info(nextn)
      if mark_categories[nextcat] then
        return normalize_marks(head, n)
      end
    end
  end
  return head, n.next
end


-- Node id of glyph nodes, looked up by name instead of hard-coding a number
-- because the numeric id is not the same in every LuaTeX release.
local glyph_id = node.id("glyph")

--- Walk a node list and normalize every letter + combining mark sequence.
-- Each glyph node is handed to normalize_glyphs, which may replace nodes and
-- tells the loop where to continue. With `M.debug` set, the name and
-- category of every visited glyph are printed, and each run of glyphs is
-- reported as a "text chunk" when a non-glyph node ends it.
-- @param head head of the node list
-- @return the (possibly changed) head of the list
function M.nodes(head)
  local chunk = {}
  local n = head
  while n do
    if n.id == glyph_id then
      -- collect debug information only when it is going to be printed
      if M.debug then
        local charcode = n.char
        debug_msg("unicode name", name(charcode))
        debug_msg("character category", get_category(charcode))
        chunk[#chunk + 1] = char(charcode)
      end
      head, n = normalize_glyphs(head, n)
    else
      if #chunk > 0 then
        debug_msg("text chunk", table.concat(chunk))
        debug_msg("----------")
        chunk = {}
      end
      n = n.next
    end
  end
  return head
end

--[[
-- These functions aren't needed when processing buffer. We can call NFC on the whole input line
local unibytes = {}

local function get_charcategory(s)
  local s = s or ""
  local b = unibytes[s] or byte(s) or 0
  unibytes[s] = b
  return get_category(b)
end

local function normalize_charmarks(t,i)
  local c = {t[i]}
  local i = i + 1
  local s = get_charcategory(t[i])
  while mark_categories[s] do
    c[#c+1] = t[i]
    i = i + 1
    s = get_charcategory(t[i])
  end
  return NFC(table.concat(c)), i
end

local function normalize_char(t,i)
  local ch = t[i]
  local c = get_charcategory(ch)
  if letter_categories[c] then
    local nextc = get_charcategory(t[i+1])
    if mark_categories[nextc] then
      return normalize_charmarks(t,i)
    end
  end
  return ch, i+1
end
-- ]]
--- Normalize one line of input by passing it to NFC.
-- The long comment below holds an earlier character-by-character version;
-- it is not executed.
-- @param line input line
-- @return whatever NFC returns for `line`
function M.buffer(line)
  --[[
  local t = {}
  local new_t = {}
  -- we need to make a table with all uni chars on the line
  for x in gmatch(line,".") do
    t[#t+1] = x
  end
  local i = 1
  -- normalize next char
  local c, i = normalize_char(t, i)
  new_t[#new_t+1] = c
  while t[i] do
    c, i = normalize_char(t,i)
    -- local  c = t[i]
    -- i =  i + 1
    new_t[#new_t+1] = c
  end
  return table.concat(new_t)
  --]]
  return NFC(line)
end


return M

이제 사진을 찍을 시간입니다.

정규화 없이:

여기에 이미지 설명을 입력하세요

구성된 그리스어 문자가 잘못되었음을 알 수 있습니다. 다른 조합은 Linux Libertine에서 지원됩니다.

노드 정규화:

여기에 이미지 설명을 입력하세요

그리스 문자는 정확하지만, příliš의 첫 번째 í가 틀렸습니다. 이것이 제가 얘기했던 문제입니다.

이제 버퍼 정규화:

여기에 이미지 설명을 입력하세요

이제 모든 게 괜찮아

XeLaTeX, LuaLaTeX, 글꼴 사양, 유니코드 및 정규화

답변1

관련 정보