XeLaTeX, LuaLaTeX, Fontspec, Unicode und Normalisierung

Question

Ich kenne die Antwort auf die ersten beiden Fragen nicht, da ich XeTeX nicht verwende, aber ich möchte eine Option für die dritte Frage anbieten.

Dank Arthurs Code konnte ich ein Basispaket für die Unicode-Normalisierung in LuaLaTeX erstellen. Der Code musste nur geringfügig geändert werden, um mit dem aktuellen LuaTeX zu funktionieren. Ich werde hier nur die Haupt-Lua-Datei veröffentlichen; das vollständige Projekt ist auf GitHub als uninormalize verfügbar.

Beispielverwendung:

% Preamble of a small test document for the uninormalize package.
\documentclass{article}
\usepackage{fontspec}
\usepackage[czech]{babel}
\setmainfont{Linux Libertine O}
% Options as used in this example: `nodes` together with `buffer=false`
% presumably selects the node-based normalization instead of the input-buffer
% one, and `debug` presumably turns on the package's debug messages --
% confirm against the package documentation.
\usepackage[nodes,buffer=false, debug]{uninormalize}
\begin{document}

Some tests:
\begin{itemize}
  \item combined letter ᾳ %GREEK SMALL LETTER ALPHA (U+03B1) + COMBINING GREEK YPOGEGRAMMENI (U+0345)
  \item normal letter ᾳ% GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI (U+1FB3)
\end{itemize}

Some more combined and normal letters: 
óóōōöö

Linux Libertine does support some combined chars: \parbox{4em}{příliš}
\end{document}

(Beachten Sie, dass die richtige Version dieser Datei auf Github ist, kombinierte Buchstaben wurden in diesem Beispiel falsch übertragen)

Die Hauptidee des Pakets ist folgende: Die Eingabe wird verarbeitet, und wenn ein Buchstabe gefolgt von kombinierenden Zeichen gefunden wird, wird diese Folge durch die normalisierte NFC-Form ersetzt. Es stehen zwei Methoden zur Verfügung. Mein erster Ansatz bestand darin, Knotenverarbeitungs-Callbacks zu verwenden, um zerlegte Glyphen durch normalisierte Zeichen zu ersetzen. Dies hätte den Vorteil, dass die Verarbeitung mithilfe von Knotenattributen überall ein- und ausgeschaltet werden könnte. Eine weitere mögliche Funktion wäre, zu prüfen, ob die aktuelle Schriftart das normalisierte Zeichen enthält, und andernfalls die Originalform zu verwenden. Leider schlägt dies in meinen Tests bei einigen Zeichen fehl; insbesondere steht das zusammengesetzte í in den Knoten als dotless i + ´ statt als i + ´, was nach der Normalisierung nicht das richtige Zeichen ergibt, sodass stattdessen die zusammengesetzten Zeichen verwendet werden. Dies erzeugt jedoch eine Ausgabe mit falscher Akzentplatzierung. Diese Methode muss also entweder korrigiert werden oder ist völlig falsch.

Die andere Methode besteht darin, den Callback process_input_buffer zu verwenden, um die Eingabedatei beim Lesen von der Festplatte zu normalisieren. Diese Methode erlaubt weder die Verwendung von Informationen aus Schriftarten noch das Ausschalten mitten in der Zeile, ist aber wesentlich einfacher zu implementieren. Die Callback-Funktion kann folgendermaßen aussehen:

-- Callback sketch for process_input_buffer: hands one input line to NFC and
-- returns whatever NFC returns for it.
function buffer_callback(line) 
  return NFC(line)
end

was ein wirklich schönes Ergebnis ist, nachdem man drei Tage mit der Version der Knotenverarbeitung verbracht hat.

Aus Neugier, dies ist das Lua-Paket:

local M = {}
dofile("unicode-names.lua")
dofile('unicode-normalization.lua')
local NFC = unicode.conformance.toNFC
local char = unicode.utf8.char
local gmatch = unicode.utf8.gmatch
local name = unicode.conformance.name
local byte = unicode.utf8.byte
local unidata = characters.data
local length = unicode.utf8.len

M.debug = false

--- Print a debug message when `M.debug` is enabled.
-- Every argument is forwarded to `print` after the "[uninormalize]" tag.
-- Real varargs are used instead of packing the arguments into a table and
-- calling `unpack`: a table built from arguments that include nil has holes,
-- its length is then unspecified, and the output could be cut off at an
-- arbitrary nil. `unpack` is also not a global in newer Lua versions.
-- @param ... values to print; nil values are printed as "nil"
local function debug_msg(...)
  if M.debug then
    print("[uninormalize]", ...)
  end
end

--- Turn a sequence into a lookup set.
-- @param t sequence of values (read from index 1 up to the first nil)
-- @return table mapping every value of `t` to `true`
local function make_hash (t)
  local set = {}
  for index = 1, math.huge do
    local item = t[index]
    if item == nil then
      break
    end
    set[item] = true
  end
  return set
end

-- Category codes (compared against the `category` field read from
-- characters.data) that this module treats as base letters.
local letter_categories = make_hash {"lu","ll","lt","lo","lm"}

-- Category codes that this module treats as combining marks.
local mark_categories = make_hash {"mn","mc","me"}

--- Debug helper: report the names of all characters of a string.
-- Each character of `s` is converted to its code with `byte`, looked up with
-- `name`, and the names are sent, joined by ":", to `debug_msg`.
-- @param s string whose characters are listed
local function printchars(s)
    local names = {}
    for ch in gmatch(s, ".") do
        local code = byte(ch)
        names[#names + 1] = name(code)
    end
    local joined = table.concat(names, ":")
    debug_msg("characters", joined)
end

-- Cache of category lookups, keyed by character code.
local categories = {}


--- Look up the category of a character code, using the cache first.
-- @param charcode character code; a missing code is looked up as ""
-- @return the `category` field of the matching `unidata` entry, or nil when
--   there is no entry or the entry has no category
local function get_category(charcode)
  local key = charcode or ""
  local cached = categories[key]
  if cached then
    return cached
  end
  local entry = unidata[key] or {}
  local found = entry.category
  categories[key] = found
  return found
end

--- Read the character code of a node together with its category.
-- @param n node with a `char` field
-- @return the node's `char` value, followed by the result of `get_category`
--   for that value
local function glyph_info(n)
  local code = n.char
  return code, get_category(code)
end

--- Return the combining mark carried by a node, if any.
-- @param n node to inspect; may be nil (e.g. past the end of a node list)
-- @return the mark as a string built with `char` when `n` is a glyph node
--   whose category is in `mark_categories`, otherwise false
local function get_mark(n)
  -- Callers pass the successor returned by node.remove, which is nil once
  -- the end of the list has been reached; indexing it would raise an error.
  if not n then
    return false
  end
  -- Resolve the glyph id by name: the numeric id is not the same in all
  -- LuaTeX versions, so a hard-coded 37 is unreliable.
  if n.id == node.id("glyph") then
    local character, cat = glyph_info(n)
    if mark_categories[cat] then
      return char(character)
    end
  end
  return false
end

--- Insert one glyph node per character of `s` in front of `nextn`.
-- @param head head of the node list
-- @param nextn node before which the new glyphs are inserted
-- @param s string whose characters become glyph nodes
-- @param lang value copied to the `lang` field of every new glyph
-- @param font value copied to the `font` field of every new glyph
-- @param subtype subtype used when creating every new glyph
-- @return the (possibly new) head of the list, and the last glyph that was
--   inserted (nil when `s` contained no characters)
local function make_glyphs(head, nextn, s, lang, font, subtype)
  -- Resolve the glyph id by name instead of hard-coding 37, which is only
  -- valid for some LuaTeX versions.
  local glyph_id = node.id("glyph")
  local report = length(s) ~= 1
  -- `last` is a local: the previous code assigned to an undeclared `newn`,
  -- creating a global, and returned nothing but `head` for longer strings.
  local last
  for ch in gmatch(s, ".") do
    if report then
      debug_msg("multi letter", ch)
    end
    local glyph = node.new(glyph_id, subtype)
    glyph.lang = lang
    glyph.font = font
    glyph.char = byte(ch)
    head, last = node.insert_before(head, nextn, glyph)
  end
  return head, last
end

--- Replace a base glyph and the marks following it by their NFC form.
-- The base glyph `n` and every directly following mark glyph are removed
-- from the list; the concatenated text is normalized with `NFC` and inserted
-- again as new glyph nodes at the same position.
-- @param head head of the node list
-- @param n base glyph node; it is removed and freed by this function
-- @return the (possibly new) head of the list, and the first node after the
--   replaced sequence (nil at the end of the list)
local function normalize_marks(head, n)
  local lang, font, subtype = n.lang, n.font, n.subtype
  local text = { char(n.char) }
  local nextn
  head, nextn = node.remove(head, n)
  -- node.remove only unlinks the node; free it so it does not leak.
  node.free(n)
  -- `nextn` is nil when the removed node was the last one in the list.
  local info = nextn and get_mark(nextn)
  while info do
    text[#text+1] = info
    local mark = nextn
    head, nextn = node.remove(head, mark)
    node.free(mark)
    info = nextn and get_mark(nextn)
  end
  local s = NFC(table.concat(text))
  debug_msg("We've got mark: " .. s)
  -- make_glyphs creates and inserts the replacement nodes itself; no
  -- separate glyph node is allocated here (the earlier one was never used).
  head = make_glyphs(head, nextn, s, lang, font, subtype)
  if M.debug then
    -- Walking the whole list is only needed for the debug output.
    local t = {}
    for x in node.traverse_id(node.id("glyph"), head) do
      t[#t+1] = char(x.char)
    end
    debug_msg("Variables ", table.concat(t,":"), table.concat(text,";"), char(byte(s)),length(s))
  end
  return head, nextn
end

--- Normalize the sequence starting at glyph `n` if it is a letter directly
-- followed by a mark glyph; otherwise leave the list untouched.
-- @param head head of the node list
-- @param n glyph node to examine
-- @return the (possibly new) head of the list, and the node at which the
--   caller should continue (nil at the end of the list)
local function normalize_glyphs(head, n)
  local _, category = glyph_info(n)
  local nextn = n.next
  -- `nextn` is nil when `n` is the last node of the list, so it has to be
  -- checked before its id is read. The glyph id is resolved by name because
  -- the numeric value is not the same in all LuaTeX versions.
  if letter_categories[category] and nextn and nextn.id == node.id("glyph") then
    local _, nextcat = glyph_info(nextn)
    if mark_categories[nextcat] then
      return normalize_marks(head, n)
    end
  end
  return head, nextn
end


--- Node-list entry point: normalize every letter + marks sequence.
-- Walks the list starting at `head` and hands each glyph node to
-- `normalize_glyphs`, which may replace nodes and returns the node to
-- continue with. With `M.debug` set, the text of each glyph run is reported.
-- @param head head of the node list
-- @return the (possibly new) head of the list
function M.nodes(head)
  -- Resolve the glyph id by name; a hard-coded 37 is only valid for some
  -- LuaTeX versions.
  local glyph_id = node.id("glyph")
  local t = {}
  local text = false
  local n = head
  while n do
    if n.id == glyph_id then
      if M.debug then
        -- Name lookup and text collection are needed for debug output only.
        local charcode = n.char
        debug_msg("unicode name", name(charcode))
        debug_msg("character category", get_category(charcode))
        t[#t+1] = char(charcode)
      end
      text = true
      head, n = normalize_glyphs(head, n)
    else
      if text then
        debug_msg("text chunk", table.concat(t))
        debug_msg("----------")
        t = {}
      end
      text = false
      n = n.next
    end
  end
  return head
end

--[[
-- These functions aren't needed when processing buffer. We can call NFC on the whole input line
local unibytes = {}

local function get_charcategory(s)
  local s = s or ""
  local b = unibytes[s] or byte(s) or 0
  unibytes[s] = b
  return get_category(b)
end

local function normalize_charmarks(t,i)
  local c = {t[i]}
  local i = i + 1
  local s = get_charcategory(t[i])
  while mark_categories[s] do
    c[#c+1] = t[i]
    i = i + 1
    s = get_charcategory(t[i])
  end
  return NFC(table.concat(c)), i
end

local function normalize_char(t,i)
  local ch = t[i]
  local c = get_charcategory(ch)
  if letter_categories[c] then
    local nextc = get_charcategory(t[i+1])
    if mark_categories[nextc] then
      return normalize_charmarks(t,i)
    end
  end
  return ch, i+1
end
-- ]]
--- Input-buffer entry point: normalize one line to NFC.
-- The whole line is handed to NFC in a single call.
-- @param line line to normalize
-- @return the result of NFC(line)
function M.buffer(line)
  -- The block comment below holds an earlier per-character variant; it is
  -- disabled and kept for reference only.
  --[[
  local t = {}
  local new_t = {}
  -- we need to make table with all uni chars on the line
  for x in gmatch(line,".") do
    t[#t+1] = x
  end
  local i = 1
  -- normalize next char
  local c, i = normalize_char(t, i)
  new_t[#new_t+1] = c
  while t[i] do
    c, i = normalize_char(t,i)
    -- local  c = t[i]
    -- i =  i + 1
    new_t[#new_t+1] = c
  end
  return table.concat(new_t)
  --]]
  return NFC(line)
end


return M

und jetzt ist es Zeit für ein paar Bilder.

ohne Normalisierung:

Bildbeschreibung hier eingeben

Sie können sehen, dass zusammengesetzte griechische Zeichen falsch sind, andere Kombinationen werden von Linux Libertine unterstützt

mit Knotennormalisierung:

Bildbeschreibung hier eingeben

Griechische Buchstaben sind richtig, aber das erste í in příliš ist falsch. Das ist das Problem, von dem ich gesprochen habe.

und nun die Puffernormalisierung:

Bildbeschreibung hier eingeben

alles ist jetzt in Ordnung

Answer 1

Ich kenne die Antwort auf die ersten beiden Fragen nicht, da ich XeTeX nicht verwende, aber ich möchte eine Option für die dritte Frage anbieten.

Dank Arthurs Code konnte ich ein Basispaket für die Unicode-Normalisierung in LuaLaTeX erstellen. Der Code musste nur geringfügig geändert werden, um mit dem aktuellen LuaTeX zu funktionieren. Ich werde hier nur die Haupt-Lua-Datei veröffentlichen; das vollständige Projekt ist auf GitHub als uninormalize verfügbar.

Beispielverwendung:

% Preamble of a small test document for the uninormalize package.
\documentclass{article}
\usepackage{fontspec}
\usepackage[czech]{babel}
\setmainfont{Linux Libertine O}
% Options as used in this example: `nodes` together with `buffer=false`
% presumably selects the node-based normalization instead of the input-buffer
% one, and `debug` presumably turns on the package's debug messages --
% confirm against the package documentation.
\usepackage[nodes,buffer=false, debug]{uninormalize}
\begin{document}

Some tests:
\begin{itemize}
  \item combined letter ᾳ %GREEK SMALL LETTER ALPHA (U+03B1) + COMBINING GREEK YPOGEGRAMMENI (U+0345)
  \item normal letter ᾳ% GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI (U+1FB3)
\end{itemize}

Some more combined and normal letters: 
óóōōöö

Linux Libertine does support some combined chars: \parbox{4em}{příliš}
\end{document}

(Beachten Sie, dass die richtige Version dieser Datei auf Github ist, kombinierte Buchstaben wurden in diesem Beispiel falsch übertragen)

Die Hauptidee des Pakets ist folgende: Die Eingabe wird verarbeitet, und wenn ein Buchstabe gefolgt von kombinierenden Zeichen gefunden wird, wird diese Folge durch die normalisierte NFC-Form ersetzt. Es stehen zwei Methoden zur Verfügung. Mein erster Ansatz bestand darin, Knotenverarbeitungs-Callbacks zu verwenden, um zerlegte Glyphen durch normalisierte Zeichen zu ersetzen. Dies hätte den Vorteil, dass die Verarbeitung mithilfe von Knotenattributen überall ein- und ausgeschaltet werden könnte. Eine weitere mögliche Funktion wäre, zu prüfen, ob die aktuelle Schriftart das normalisierte Zeichen enthält, und andernfalls die Originalform zu verwenden. Leider schlägt dies in meinen Tests bei einigen Zeichen fehl; insbesondere steht das zusammengesetzte í in den Knoten als dotless i + ´ statt als i + ´, was nach der Normalisierung nicht das richtige Zeichen ergibt, sodass stattdessen die zusammengesetzten Zeichen verwendet werden. Dies erzeugt jedoch eine Ausgabe mit falscher Akzentplatzierung. Diese Methode muss also entweder korrigiert werden oder ist völlig falsch.

Die andere Methode besteht darin, den Callback process_input_buffer zu verwenden, um die Eingabedatei beim Lesen von der Festplatte zu normalisieren. Diese Methode erlaubt weder die Verwendung von Informationen aus Schriftarten noch das Ausschalten mitten in der Zeile, ist aber wesentlich einfacher zu implementieren. Die Callback-Funktion kann folgendermaßen aussehen:

-- Callback sketch for process_input_buffer: hands one input line to NFC and
-- returns whatever NFC returns for it.
function buffer_callback(line) 
  return NFC(line)
end

was ein wirklich schönes Ergebnis ist, nachdem man drei Tage mit der Version der Knotenverarbeitung verbracht hat.

Aus Neugier, dies ist das Lua-Paket:

local M = {}
dofile("unicode-names.lua")
dofile('unicode-normalization.lua')
local NFC = unicode.conformance.toNFC
local char = unicode.utf8.char
local gmatch = unicode.utf8.gmatch
local name = unicode.conformance.name
local byte = unicode.utf8.byte
local unidata = characters.data
local length = unicode.utf8.len

M.debug = false

--- Print a debug message when `M.debug` is enabled.
-- Every argument is forwarded to `print` after the "[uninormalize]" tag.
-- Real varargs are used instead of packing the arguments into a table and
-- calling `unpack`: a table built from arguments that include nil has holes,
-- its length is then unspecified, and the output could be cut off at an
-- arbitrary nil. `unpack` is also not a global in newer Lua versions.
-- @param ... values to print; nil values are printed as "nil"
local function debug_msg(...)
  if M.debug then
    print("[uninormalize]", ...)
  end
end

--- Turn a sequence into a lookup set.
-- @param t sequence of values (read from index 1 up to the first nil)
-- @return table mapping every value of `t` to `true`
local function make_hash (t)
  local set = {}
  for index = 1, math.huge do
    local item = t[index]
    if item == nil then
      break
    end
    set[item] = true
  end
  return set
end

-- Category codes (compared against the `category` field read from
-- characters.data) that this module treats as base letters.
local letter_categories = make_hash {"lu","ll","lt","lo","lm"}

-- Category codes that this module treats as combining marks.
local mark_categories = make_hash {"mn","mc","me"}

--- Debug helper: report the names of all characters of a string.
-- Each character of `s` is converted to its code with `byte`, looked up with
-- `name`, and the names are sent, joined by ":", to `debug_msg`.
-- @param s string whose characters are listed
local function printchars(s)
    local names = {}
    for ch in gmatch(s, ".") do
        local code = byte(ch)
        names[#names + 1] = name(code)
    end
    local joined = table.concat(names, ":")
    debug_msg("characters", joined)
end

-- Cache of category lookups, keyed by character code.
local categories = {}


--- Look up the category of a character code, using the cache first.
-- @param charcode character code; a missing code is looked up as ""
-- @return the `category` field of the matching `unidata` entry, or nil when
--   there is no entry or the entry has no category
local function get_category(charcode)
  local key = charcode or ""
  local cached = categories[key]
  if cached then
    return cached
  end
  local entry = unidata[key] or {}
  local found = entry.category
  categories[key] = found
  return found
end

--- Read the character code of a node together with its category.
-- @param n node with a `char` field
-- @return the node's `char` value, followed by the result of `get_category`
--   for that value
local function glyph_info(n)
  local code = n.char
  return code, get_category(code)
end

--- Return the combining mark carried by a node, if any.
-- @param n node to inspect; may be nil (e.g. past the end of a node list)
-- @return the mark as a string built with `char` when `n` is a glyph node
--   whose category is in `mark_categories`, otherwise false
local function get_mark(n)
  -- Callers pass the successor returned by node.remove, which is nil once
  -- the end of the list has been reached; indexing it would raise an error.
  if not n then
    return false
  end
  -- Resolve the glyph id by name: the numeric id is not the same in all
  -- LuaTeX versions, so a hard-coded 37 is unreliable.
  if n.id == node.id("glyph") then
    local character, cat = glyph_info(n)
    if mark_categories[cat] then
      return char(character)
    end
  end
  return false
end

--- Insert one glyph node per character of `s` in front of `nextn`.
-- @param head head of the node list
-- @param nextn node before which the new glyphs are inserted
-- @param s string whose characters become glyph nodes
-- @param lang value copied to the `lang` field of every new glyph
-- @param font value copied to the `font` field of every new glyph
-- @param subtype subtype used when creating every new glyph
-- @return the (possibly new) head of the list, and the last glyph that was
--   inserted (nil when `s` contained no characters)
local function make_glyphs(head, nextn, s, lang, font, subtype)
  -- Resolve the glyph id by name instead of hard-coding 37, which is only
  -- valid for some LuaTeX versions.
  local glyph_id = node.id("glyph")
  local report = length(s) ~= 1
  -- `last` is a local: the previous code assigned to an undeclared `newn`,
  -- creating a global, and returned nothing but `head` for longer strings.
  local last
  for ch in gmatch(s, ".") do
    if report then
      debug_msg("multi letter", ch)
    end
    local glyph = node.new(glyph_id, subtype)
    glyph.lang = lang
    glyph.font = font
    glyph.char = byte(ch)
    head, last = node.insert_before(head, nextn, glyph)
  end
  return head, last
end

--- Replace a base glyph and the marks following it by their NFC form.
-- The base glyph `n` and every directly following mark glyph are removed
-- from the list; the concatenated text is normalized with `NFC` and inserted
-- again as new glyph nodes at the same position.
-- @param head head of the node list
-- @param n base glyph node; it is removed and freed by this function
-- @return the (possibly new) head of the list, and the first node after the
--   replaced sequence (nil at the end of the list)
local function normalize_marks(head, n)
  local lang, font, subtype = n.lang, n.font, n.subtype
  local text = { char(n.char) }
  local nextn
  head, nextn = node.remove(head, n)
  -- node.remove only unlinks the node; free it so it does not leak.
  node.free(n)
  -- `nextn` is nil when the removed node was the last one in the list.
  local info = nextn and get_mark(nextn)
  while info do
    text[#text+1] = info
    local mark = nextn
    head, nextn = node.remove(head, mark)
    node.free(mark)
    info = nextn and get_mark(nextn)
  end
  local s = NFC(table.concat(text))
  debug_msg("We've got mark: " .. s)
  -- make_glyphs creates and inserts the replacement nodes itself; no
  -- separate glyph node is allocated here (the earlier one was never used).
  head = make_glyphs(head, nextn, s, lang, font, subtype)
  if M.debug then
    -- Walking the whole list is only needed for the debug output.
    local t = {}
    for x in node.traverse_id(node.id("glyph"), head) do
      t[#t+1] = char(x.char)
    end
    debug_msg("Variables ", table.concat(t,":"), table.concat(text,";"), char(byte(s)),length(s))
  end
  return head, nextn
end

--- Normalize the sequence starting at glyph `n` if it is a letter directly
-- followed by a mark glyph; otherwise leave the list untouched.
-- @param head head of the node list
-- @param n glyph node to examine
-- @return the (possibly new) head of the list, and the node at which the
--   caller should continue (nil at the end of the list)
local function normalize_glyphs(head, n)
  local _, category = glyph_info(n)
  local nextn = n.next
  -- `nextn` is nil when `n` is the last node of the list, so it has to be
  -- checked before its id is read. The glyph id is resolved by name because
  -- the numeric value is not the same in all LuaTeX versions.
  if letter_categories[category] and nextn and nextn.id == node.id("glyph") then
    local _, nextcat = glyph_info(nextn)
    if mark_categories[nextcat] then
      return normalize_marks(head, n)
    end
  end
  return head, nextn
end


--- Node-list entry point: normalize every letter + marks sequence.
-- Walks the list starting at `head` and hands each glyph node to
-- `normalize_glyphs`, which may replace nodes and returns the node to
-- continue with. With `M.debug` set, the text of each glyph run is reported.
-- @param head head of the node list
-- @return the (possibly new) head of the list
function M.nodes(head)
  -- Resolve the glyph id by name; a hard-coded 37 is only valid for some
  -- LuaTeX versions.
  local glyph_id = node.id("glyph")
  local t = {}
  local text = false
  local n = head
  while n do
    if n.id == glyph_id then
      if M.debug then
        -- Name lookup and text collection are needed for debug output only.
        local charcode = n.char
        debug_msg("unicode name", name(charcode))
        debug_msg("character category", get_category(charcode))
        t[#t+1] = char(charcode)
      end
      text = true
      head, n = normalize_glyphs(head, n)
    else
      if text then
        debug_msg("text chunk", table.concat(t))
        debug_msg("----------")
        t = {}
      end
      text = false
      n = n.next
    end
  end
  return head
end

--[[
-- These functions aren't needed when processing buffer. We can call NFC on the whole input line
local unibytes = {}

local function get_charcategory(s)
  local s = s or ""
  local b = unibytes[s] or byte(s) or 0
  unibytes[s] = b
  return get_category(b)
end

local function normalize_charmarks(t,i)
  local c = {t[i]}
  local i = i + 1
  local s = get_charcategory(t[i])
  while mark_categories[s] do
    c[#c+1] = t[i]
    i = i + 1
    s = get_charcategory(t[i])
  end
  return NFC(table.concat(c)), i
end

local function normalize_char(t,i)
  local ch = t[i]
  local c = get_charcategory(ch)
  if letter_categories[c] then
    local nextc = get_charcategory(t[i+1])
    if mark_categories[nextc] then
      return normalize_charmarks(t,i)
    end
  end
  return ch, i+1
end
-- ]]
--- Input-buffer entry point: normalize one line to NFC.
-- The whole line is handed to NFC in a single call.
-- @param line line to normalize
-- @return the result of NFC(line)
function M.buffer(line)
  -- The block comment below holds an earlier per-character variant; it is
  -- disabled and kept for reference only.
  --[[
  local t = {}
  local new_t = {}
  -- we need to make table with all uni chars on the line
  for x in gmatch(line,".") do
    t[#t+1] = x
  end
  local i = 1
  -- normalize next char
  local c, i = normalize_char(t, i)
  new_t[#new_t+1] = c
  while t[i] do
    c, i = normalize_char(t,i)
    -- local  c = t[i]
    -- i =  i + 1
    new_t[#new_t+1] = c
  end
  return table.concat(new_t)
  --]]
  return NFC(line)
end


return M

und jetzt ist es Zeit für ein paar Bilder.

ohne Normalisierung:

Bildbeschreibung hier eingeben

Sie können sehen, dass zusammengesetzte griechische Zeichen falsch sind, andere Kombinationen werden von Linux Libertine unterstützt

mit Knotennormalisierung:

Bildbeschreibung hier eingeben

Griechische Buchstaben sind richtig, aber das erste í in příliš ist falsch. Das ist das Problem, von dem ich gesprochen habe.

und nun die Puffernormalisierung:

Bildbeschreibung hier eingeben

alles ist jetzt in Ordnung

XeLaTeX, LuaLaTeX, Fontspec, Unicode und Normalisierung

Antwort1

verwandte Informationen