% \iffalse meta-comment
%
%% File: l3unicode.dtx
%
% Copyright (C) 2018-2024 The LaTeX Project
%
% It may be distributed and/or modified under the conditions of the
% LaTeX Project Public License (LPPL), either version 1.3c of this
% license or (at your option) any later version.  The latest version
% of this license is in the file
%
%    https://www.latex-project.org/lppl.txt
%
% This file is part of the "l3kernel bundle" (The Work in LPPL)
% and all files in that bundle must be distributed together.
%
% -----------------------------------------------------------------------
%
% The development version of the bundle can be found at
%
%    https://github.com/latex3/latex3
%
% for those people who are interested.
%
%<*driver>
\documentclass[full,kernel]{l3doc}
\begin{document}
  \DocInput{\jobname.dtx}
\end{document}
%</driver>
% \fi
%
% \title{^^A
%   The \pkg{l3unicode} module\\ Unicode support functions^^A
% }
%
% \author{^^A
%  The \LaTeX{} Project\thanks
%    {^^A
%      E-mail:
%        \href{mailto:latex-team@latex-project.org}
%          {latex-team@latex-project.org}^^A
%    }^^A
% }
%
% \date{Released 2024-11-02}
%
% \maketitle
%
% \begin{documentation}
%
% This module provides Unicode-specific functions along with loading data
% from a range of Unicode Consortium files. Most of the code here is
% internal, but there is a small set of public functions. These work with
% Unicode \meta{codepoints} and are designed to give usable results with
% both Unicode-aware and $8$-bit engines.
%
% \begin{function}[EXP, added = 2022-10-09, updated = 2022-11-09]
%   {\codepoint_generate:nn}
%   \begin{syntax}
%     \cs{codepoint_generate:nn} \Arg{codepoint} \Arg{catcode}
%   \end{syntax}
%   Generates one or more character tokens representing the \meta{codepoint}.
% With Unicode engines, exactly one character token will be generated, and % this will have the \meta{catcode} specified as the second argument: % \begin{itemize} % \item $1$ (begin group) % \item $2$ (end group) % \item $3$ (math toggle) % \item $4$ (alignment) % \item $6$ (parameter) % \item $7$ (math superscript) % \item $8$ (math subscript) % \item $10$ (space) % \item $11$ (letter) % \item $12$ (other) % \item $13$ (active) % \end{itemize} % For $8$-bit engines, between one and four character tokens will be % produced: these will be the bytes of the UTF-8 representation of the % \meta{codepoint}. For all codepoints outside of the classical ASCII % range, the generated character tokens will be active (category code % $13$); for codepoints in the ASCII range, the given \meta{catcode} % will be used. To allow the result of this function to be used % inside an expansion context, the result is protected by \cs{exp_not:n}. % % \begin{texnote} % Users of (u)p\TeX{} note that these engines are treated as $8$-bit in % this context. In particular, for up\TeX{}, irrespective of the % \tn{kcatcode} of the \meta{codepoint}, any value outside the ASCII range % will result in a series of active bytes being generated. % \end{texnote} % \end{function} % % \begin{function}[EXP, added = 2022-10-09] % {\codepoint_str_generate:n} % \begin{syntax} % \cs{codepoint_str_generate:n} \Arg{codepoint} % \end{syntax} % Generates one or more character tokens representing the \meta{codepoint}. % With Unicode engines, exactly one character token will be generated. % For $8$-bit engines, between one and four character tokens will be % produced: these will be the bytes of the UTF-8 representation of the % \meta{codepoint}. All of the generated character tokens will be of % category code $12$, except any spaces (codepoint $32$), which will be % category code $10$. 
% \end{function} % % \begin{function}[added = 2023-06-19, EXP]{\codepoint_to_category:n} % \begin{syntax} % \cs{codepoint_to_category:n} \Arg{codepoint} % \end{syntax} % Expands to the Unicode general category identifier of the \meta{codepoint}. % The general category identifier is a string made up of two letter % characters, the first uppercase and the second lowercase. The uppercase % letters divide codepoints into broader groups, which are then refined % by the lowercase letter. For example, codepoints representing letters % all have identifiers starting \texttt{L}, for example \texttt{Lu} % (uppercase letter), \texttt{Lt} (titlecase letter), \emph{etc.} % Full details are available in the documentation provided by the Unicode % Consortium: see % \url{https://www.unicode.org/reports/tr44/#General_Category_Values} % \end{function} % % \begin{function}[added = 2022-10-09, EXP]{\codepoint_to_nfd:n} % \begin{syntax} % \cs{codepoint_to_nfd:n} \Arg{codepoint} % \end{syntax} % Converts the \meta{codepoint} to the Unicode Normalization % Form Canonical Decomposition. The generated character(s) will have % the current category code as they would if typed in directly for Unicode % engines; for $8$-bit engines, active characters are used for all codepoints % outside of the ASCII range. % \end{function} % % \end{documentation} % % \begin{implementation} % % \section{\pkg{l3unicode} implementation} % % \begin{macrocode} %<*package> % \end{macrocode} % % \begin{macrocode} %<@@=codepoint> % \end{macrocode} % % \subsection{User functions} % % \begin{macro}[EXP]{\codepoint_str_generate:n} % \begin{macro}[EXP]{\@@_str_generate:nnnn} % \begin{macro}[EXP]{\codepoint_generate:nn} % \begin{macro}[EXP]{\@@_generate:nnnn} % \begin{macro}[EXP]{\@@_generate:n} % Conversion of a codepoint to a character (Unicode engines) or to one % or more bytes ($8$-bit engines) is required. 
%   For loading the data,
%   all that is needed is the form which creates strings: these are outside
%   the group as they will also be used when looking up data in the hash
%   table storage at point-of-use. Later, we will also need functions that
%   can generate character tokens for document use: those are defined below,
%   in the data recovery setup.
%
%   In every definition a space (codepoint $32$) is tested for first and
%   returned directly as a space token, whatever category code was asked
%   for. The first branch of the engine test is used by Lua\TeX{} and
%   \XeTeX{}, where a single \cs{char_generate:nn} is all that is needed.
%   The second branch is used by all other engines: there the codepoint is
%   first turned into four brace groups by
%   \cs{__kernel_codepoint_to_bytes:n} (unused trailing groups are empty),
%   and one character is generated per non-empty group. Inside \cs{use:e}
%   the auxiliary is protected by \cs{exp_not:N} so that only the byte
%   calculation is expanded at that stage.
%
%   In \cs{codepoint_generate:nn} the chain of three \cs{exp_after:wN}
%   expands \cs{char_generate:nn} twice before the result is wrapped by
%   \cs{__kernel_exp_not:w}, so it is the generated character and not the
%   function call which is protected from further expansion. Note that
%   \cs{@@_generate:nnnn} generates its first \emph{two} arguments without
%   a blank test: it is only called for codepoints which are not below
%   |"80|, and relies on \cs{__kernel_codepoint_to_bytes:n} returning at
%   least two bytes for all of those.
%    \begin{macrocode}
\bool_lazy_or:nnTF
  { \sys_if_engine_luatex_p: }
  { \sys_if_engine_xetex_p: }
  {
    \cs_new:Npn \codepoint_str_generate:n #1
      {
        \int_compare:nNnTF {#1} = { `\ }
          { ~ }
          { \char_generate:nn {#1} { 12 } }
      }
    \cs_new:Npn \codepoint_generate:nn #1#2
      {
        \int_compare:nNnTF {#1} = { `\ }
          { ~ }
          {
            \__kernel_exp_not:w \exp_after:wN \exp_after:wN \exp_after:wN
              { \char_generate:nn {#1} {#2} }
          }
      }
  }
  {
    \cs_new:Npn \codepoint_str_generate:n #1
      {
        \int_compare:nNnTF {#1} = { `\ }
          { ~ }
          {
            \use:e
              {
                \exp_not:N \@@_str_generate:nnnn
                  \__kernel_codepoint_to_bytes:n {#1}
              }
          }
      }
    \cs_new:Npn \@@_str_generate:nnnn #1#2#3#4
      {
        \char_generate:nn {#1} { 12 }
        \tl_if_blank:nF {#2}
          {
            \char_generate:nn {#2} { 12 }
            \tl_if_blank:nF {#3}
              {
                \char_generate:nn {#3} { 12 }
                \tl_if_blank:nF {#4}
                  { \char_generate:nn {#4} { 12 } }
              }
          }
      }
    \cs_new:Npn \codepoint_generate:nn #1#2
      {
        \int_compare:nNnTF {#1} = { `\ }
          { ~ }
          {
            \int_compare:nNnTF {#1} < { "80 }
              {
                \__kernel_exp_not:w \exp_after:wN \exp_after:wN \exp_after:wN
                  { \char_generate:nn {#1} {#2} }
              }
              {
                \use:e
                  {
                    \exp_not:N \@@_generate:nnnn
                      \__kernel_codepoint_to_bytes:n {#1}
                  }
              }
          }
      }
    \cs_new:Npn \@@_generate:nnnn #1#2#3#4
      {
        \__kernel_exp_not:w \exp_after:wN
          {
            \tex_expanded:D
              {
                \@@_generate:n {#1}
                \@@_generate:n {#2}
                \tl_if_blank:nF {#3}
                  {
                    \@@_generate:n {#3}
                    \tl_if_blank:nF {#4}
                      { \@@_generate:n {#4} }
                  }
              }
          }
      }
    \cs_new:Npn \@@_generate:n #1
      {
        \__kernel_exp_not:w \exp_after:wN \exp_after:wN \exp_after:wN
          { \char_generate:nn {#1} { 13 } }
      }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \begin{macro}[EXP]{\__kernel_codepoint_to_bytes:n}
% \begin{macro}[EXP]{\@@_to_bytes_auxi:n}
% \begin{macro}[EXP]{\@@_to_bytes_auxii:Nnn}
% \begin{macro}[EXP]{\@@_to_bytes_auxiii:n}
% \begin{macro}[EXP]
%   {
%     \@@_to_bytes_outputi:nw   ,
%     \@@_to_bytes_outputii:nw  ,
%     \@@_to_bytes_outputiii:nw ,
%     \@@_to_bytes_outputiv:nw
%   }
% \begin{macro}[EXP]
%   {\@@_to_bytes_output:nnn, \@@_to_bytes_output:fnn}
% \begin{macro}[EXP]{\@@_to_bytes_end:}
%   This code converts a codepoint into the correct UTF-8 representation.
%   In terms of the algorithm itself, see
%   \url{https://en.wikipedia.org/wiki/UTF-8} for the octet pattern.
%
%   The result is always four brace groups, holding the decimal value of
%   each byte in order, with unused trailing groups left empty. The
%   argument is first evaluated to a plain integer. The number of bytes is
%   then selected by range: codepoints up to |"7F| give one byte, those
%   below |"800| two bytes, those below |"10000| three bytes and all
%   others four bytes. The lead byte of a multi-byte sequence is |"C0|,
%   |"E0| or |"F0| plus the high-order part of the codepoint
%   (\cs{@@_to_bytes_auxii:Nnn}); each continuation byte is $128$ plus six
%   bits of the codepoint (\cs{@@_to_bytes_auxiii:n}).
%
%   The lower boundary has to be tested as |> "7F|: the codepoint |"80| is
%   outside of the ASCII range and must be encoded as the two bytes
%   $194$, $128$. A lone byte $128$ is not valid UTF-8, and the $8$-bit
%   version of \cs{codepoint_generate:nn} always generates at least two
%   bytes for codepoints which are not below |"80|.
%
%   Each \texttt{output} function evaluates its byte, picks up the
%   remaining input up to \cs{@@_to_bytes_end:} and the brace groups
%   before its own slot, and puts everything back with the slot filled in.
%    \begin{macrocode}
\cs_new:Npn \__kernel_codepoint_to_bytes:n #1
  {
    \exp_args:Nf \@@_to_bytes_auxi:n
      { \int_eval:n {#1} }
  }
\cs_new:Npn \@@_to_bytes_auxi:n #1
  {
    \if_int_compare:w #1 > "7F \exp_stop_f:
      \if_int_compare:w #1 < "800 \exp_stop_f:
        \@@_to_bytes_outputi:nw
          { \@@_to_bytes_auxii:Nnn C {#1} { 64 } }
        \@@_to_bytes_outputii:nw
          { \@@_to_bytes_auxiii:n {#1} }
      \else:
        \if_int_compare:w #1 < "10000 \exp_stop_f:
          \@@_to_bytes_outputi:nw
            { \@@_to_bytes_auxii:Nnn E {#1} { 64 * 64 } }
          \@@_to_bytes_outputii:nw
            {
              \@@_to_bytes_auxiii:n
                { \int_div_truncate:nn {#1} { 64 } }
            }
          \@@_to_bytes_outputiii:nw
            { \@@_to_bytes_auxiii:n {#1} }
        \else:
          \@@_to_bytes_outputi:nw
            {
              \@@_to_bytes_auxii:Nnn F
                {#1} { 64 * 64 * 64 }
            }
          \@@_to_bytes_outputii:nw
            {
              \@@_to_bytes_auxiii:n
                { \int_div_truncate:nn {#1} { 64 * 64 } }
            }
          \@@_to_bytes_outputiii:nw
            {
              \@@_to_bytes_auxiii:n
                { \int_div_truncate:nn {#1} { 64 } }
            }
          \@@_to_bytes_outputiv:nw
            { \@@_to_bytes_auxiii:n {#1} }
        \fi:
      \fi:
    \else:
      \@@_to_bytes_outputi:nw {#1}
    \fi:
    \@@_to_bytes_end: { } { } { } { }
  }
\cs_new:Npn \@@_to_bytes_auxii:Nnn #1#2#3
  { "#10 + \int_div_truncate:nn {#2} {#3} }
\cs_new:Npn \@@_to_bytes_auxiii:n #1
  { \int_mod:nn {#1} { 64 } + 128 }
\cs_new:Npn \@@_to_bytes_outputi:nw
  #1 #2 \@@_to_bytes_end: #3
  { \@@_to_bytes_output:fnn { \int_eval:n {#1} } { } {#2} }
\cs_new:Npn \@@_to_bytes_outputii:nw
  #1 #2 \@@_to_bytes_end: #3#4
  { \@@_to_bytes_output:fnn { \int_eval:n {#1} } { {#3} } {#2} }
\cs_new:Npn \@@_to_bytes_outputiii:nw
  #1 #2 \@@_to_bytes_end: #3#4#5
  {
    \@@_to_bytes_output:fnn
      { \int_eval:n {#1} } { {#3} {#4} } {#2}
  }
\cs_new:Npn \@@_to_bytes_outputiv:nw
  #1 #2 \@@_to_bytes_end: #3#4#5#6
  {
    \@@_to_bytes_output:fnn
      { \int_eval:n {#1} } { {#3} {#4} {#5} } {#2}
  }
\cs_new:Npn \@@_to_bytes_output:nnn #1#2#3
  {
    #3
    \@@_to_bytes_end: #2 {#1}
  }
\cs_generate_variant:Nn \@@_to_bytes_output:nnn { f }
\cs_new:Npn \@@_to_bytes_end: { }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \begin{macro}[EXP]{\codepoint_to_category:n}
%   Get the value and convert back to the string. The number stored in the
%   \texttt{category} table is turned into lowercase Roman numerals, which
%   form part of the name of the constant string holding the identifier.
%    \begin{macrocode}
\cs_new:Npn \codepoint_to_category:n #1
  {
    \cs:w
      c_@@_category_
      \tex_romannumeral:D
        \__kernel_codepoint_data:nn { category } {#1}
      _str
    \cs_end:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[EXP]{\codepoint_to_nfd:n, \@@_to_nfd:n}
% \begin{macro}[EXP]{\@@_to_nfd:nn}
% \begin{macro}[EXP]{\@@_to_nfd:nnn}
% \begin{macro}[EXP]{\@@_to_nfd:nnnn}
%   Conversion to NFD is a potentially-recursive process: the key is to
%   check if we get the input codepoint back again. As far as possible,
%   we use the same path for all engines.
%
%   The argument is evaluated to a plain integer once, and is then carried
%   along with the category code to use for the output. The standard
%   definition of \cs{@@_to_nfd:n} takes the current category code of the
%   codepoint. For engines other than Lua\TeX{} and \XeTeX{} it is
%   redefined so that codepoints above |"80| are passed the fixed value
%   $12$ rather than looking up a category code. That value has no effect
%   on the result, as the $8$-bit version of \cs{codepoint_generate:nn}
%   ignores the category code for every codepoint which is not below
%   |"80|. For the same reason it makes no difference that the codepoint
%   |"80| itself takes the look-up route here.
%
%   \cs{@@_to_nfd:nn} fetches the stored decomposition, which is two brace
%   groups with the second possibly empty. In \cs{@@_to_nfd:nnnn} these
%   are |#1| and |#2|, with |#3| the codepoint that was looked up and |#4|
%   the category code. If the first entry is the codepoint itself there is
%   nothing (more) to decompose and the character is generated; otherwise
%   each entry is decomposed in turn.
%    \begin{macrocode}
\cs_new:Npn \codepoint_to_nfd:n #1
  {
    \exp_args:Ne \@@_to_nfd:n
      { \int_eval:n {#1} }
  }
\cs_new:Npn \@@_to_nfd:n #1
  { \@@_to_nfd:nn {#1} { \char_value_catcode:n {#1} } }
\bool_lazy_or:nnF
  { \sys_if_engine_luatex_p: }
  { \sys_if_engine_xetex_p: }
  {
    \cs_gset:Npn \@@_to_nfd:n #1
      {
        \int_compare:nNnTF {#1} > { "80 }
          { \@@_to_nfd:nn {#1} { 12 } }
          { \@@_to_nfd:nn {#1} { \char_value_catcode:n {#1} } }
      }
  }
\cs_new:Npn \@@_to_nfd:nn #1#2
  {
    \exp_args:Ne \@@_to_nfd:nnn
      { \@@_nfd:n {#1} } {#1} {#2}
  }
\cs_new:Npn \@@_to_nfd:nnn #1#2#3
  { \@@_to_nfd:nnnn #1 {#2} {#3} }
\cs_new:Npn \@@_to_nfd:nnnn #1#2#3#4
  {
    \int_compare:nNnTF {#1} = {#3}
      { \codepoint_generate:nn {#1} {#4} }
      {
        \@@_to_nfd:nn {#1} {#4}
        \tl_if_blank:nF {#2}
          { \@@_to_nfd:nn {#2} {#4} }
      }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \subsection{Data loader}
%
% Text operations require data from the Unicode Consortium. Data read into
% Unicode engine formats is at best a small part of what we need, so there
% is a loader here to set up the appropriate data structures.
%
% Where we need data for most or all of the Unicode range, we use the two-stage
% table approach recommended by the Unicode Consortium and demonstrated in a
% model implementation in Python in
% \url{https://www.strchr.com/multi-stage_tables}. This approach uses the
% \texttt{intarray} (\texttt{fontdimen}-based) data type as it is fast for
% random access and avoids significant hash table usage. In contrast, where
% only a small subset of codepoints are required, storage as macros is
% preferable. There is also some consideration of the effort needed to load
% data: see for example the grapheme breaking information, which would be
% problematic to convert into a two-stage table but which can be used with
% reasonable performance in a small number of comma lists (at the cost that
% breaking at higher codepoint Hangul characters will be slightly slow).
%
% \begin{variable}{\c_@@_block_size_int}
%   Choosing the block size for the blocks in the two-stage approach is
%   non-trivial: depending on the data stored, the optimal size for
%   memory usage will vary. At the same time, for us there is also the
%   question of load-time: larger blocks require longer comma lists
%   as intermediates, so are slower. As this is going to be needed
%   to use the data, we set it up outside of the group for clarity.
%    \begin{macrocode}
\int_const:Nn \c_@@_block_size_int { 64 }
%    \end{macrocode}
% \end{variable}
%
% Parsing the data files can be the same way for all engines, but where they
% are stored as character tokens, the construction method depends on whether
% they are Unicode or $8$-bit internally. Parsing is therefore done by common
% functions, with some data storage using engine-specific auxiliaries.
%
% As only the data needs to remain at the end of this process, everything
% is set up inside a group. The only thing that is outside is creating a
% stream: they are global anyway and it is best to force a stream for
% all engines.
%
% \begin{variable}{\g_@@_data_ior}
%   The stream used in turn for each of the data files read for the
%   \texttt{codepoint} data.
%    \begin{macrocode}
\ior_new:N \g_@@_data_ior
%    \end{macrocode}
% \end{variable}
%
% We need some setup for the two-part table approach. The number of blocks we
% need will be variable, but the resulting size of the stage one table
% is predictable. For performance reasons, we therefore create the stage one
% tables now so they can be used immediately, and will later rename them as
% constant tables. For each two-stage table construction, we need a comma
% list to hold the partial block and a couple of integers to track where
% we are up to. To avoid burning registers, the latter are stored in macros
% and are \enquote{fake} integers. We also avoid any \texttt{new} functions,
% keeping as much as possible local.
%
% As we need both positive and negative values, case data requires one
% two-stage table for each transformation.
% In contrast, general Unicode
% properties could be stored in one table with appropriate combination rules:
% that is not done at present but is likely to be added over time. Here, all
% that is needed is additional entries into the comma-list to create the
% structures.
%
% Notice that in the standard \pkg{expl3} way we index by position not
% offset: that does mean a little work later.
%
% For each table there is a comma list for the block under construction,
% the number which the next new block will be given (starting at $1$), and
% the number of entries already written to the stage one table (starting
% at $0$). The stage one table has one entry per block of codepoints, so
% its size is the size of the Unicode range divided by the block size.
%    \begin{macrocode}
\group_begin:
  \clist_map_inline:nn { category , uppercase , lowercase }
    {
      \cs_set_nopar:cpn { l_@@_ #1 _block_clist } { }
      \cs_set_nopar:cpn { l_@@_ #1 _block_tl } { 1 }
      \cs_set_nopar:cpn { l_@@_ #1 _pos_tl } { 0 }
      \intarray_new:cn { g_@@_ #1 _index_intarray }
        { \int_div_truncate:nn { "110000 } \c_@@_block_size_int }
    }
%    \end{macrocode}
% We need an integer value when matching the current block to those we have
% already seen, and a way to track codepoints for handling ranges. Again,
% we avoid using up registers or creating global names.
%    \begin{macrocode}
  \cs_set_nopar:Npn \l_@@_next_codepoint_fint_tl { 0 }
  \cs_set_nopar:Npn \l_@@_matched_block_tl { 0 }
%    \end{macrocode}
% For Unicode general category, there needs to be numerical representation of
% each possible value. As we need to go from string to number here, but the
% other way elsewhere, we set up fast mappings both ways, but one set local
% and the other as constants. The categories are numbered from $1$ in the
% order listed: the local macro named after the category holds the number,
% and the constant string named using the number as lowercase Roman
% numerals holds the category. The loop ends when the recursion tail quark
% is picked up as the category.
%    \begin{macrocode}
  \cs_set_protected:Npn \@@_data_auxi:w #1#2
    {
      \quark_if_recursion_tail_stop:n {#2}
      \cs_set_nopar:cpn { l_@@_category_ #2 _tl } {#1}
      \str_const:cn { c_@@_category_ \tex_romannumeral:D #1 _str } {#2}
      \exp_args:Ne \@@_data_auxi:w
        { \int_eval:n { #1 + 1 } }
    }
  \@@_data_auxi:w { 1 }
    { Lu } { Ll } { Lt } { Lm } { Lo }
    { Mn } { Me } { Mc }
    { Nd } { Nl } { No }
    { Zs } { Zl } { Zp }
    { Cc } { Cf } { Co } { Cs } { Cn }
    { Pd } { Ps } { Pe } { Pc } { Po } { Pi } { Pf }
    { Sm } { Sc } { Sk } { So }
    \q_recursion_tail
    \q_recursion_stop
%    \end{macrocode}
% Parse the main Unicode data file and pull out the NFD and case changing
% data.
% The NFD data is stored using the hash table approach and can yield
% a predictable number of codepoints: one or two. We also need the case data,
% which will be modified further below. To allow for finding ranges, the
% description of the codepoint needs to be carried forward.
%
% The first function splits off the first nine fields of a line: |#1| is
% the codepoint, |#2| the description, |#3| the general category and |#6|
% the decomposition. A decomposition is only stored if it is present and
% does not start with |<|, i.e.~is not one of the tagged (compatibility)
% mappings. The three fields needed later are put back in front of the
% unread remainder of the line. The decomposition is stored as two brace
% groups, the second of which is empty if there is only one codepoint.
%    \begin{macrocode}
  \cs_set_protected:Npn \@@_data_auxi:w
    #1 ; #2 ; #3 ; #4 ; #5 ; #6 ; #7 ; #8 ; #9 ;
    {
      \tl_if_blank:nF {#6}
        {
          \tl_if_head_eq_charcode:nNF {#6} < % >
            { \@@_data_auxii:w #1 ; #6 ~ \q_stop }
        }
      \@@_data_auxiii:w #1 ; #2 ; #3 ;
    }
  \cs_set_protected:Npn \@@_data_auxii:w #1 ; #2 ~ #3 \q_stop
    {
      \tl_const:ce
        { c_@@_nfd_ \codepoint_str_generate:n {"#1} _tl }
        {
          {"#2}
          { \tl_if_blank:nF {#3} {"#3} }
        }
    }
%    \end{macrocode}
% The category data needs to be converted from a string to the numerical
% equivalent: a simple operation.
% The case data is going to be stored as an offset from the parent character,
% rather than an absolute value. We therefore deal with that plus the situation
% where a codepoint has no mapping data in one shot. Here, |#7|, |#8| and
% |#9| are the last three fields of the line, used as the uppercase,
% lowercase and titlecase mappings, respectively. A blank mapping gives
% an offset of zero.
%    \begin{macrocode}
  \cs_set_protected:Npn \@@_data_auxiii:w
    #1 ; #2 ; #3 ; #4 ; #5 ; #6 ; #7 ; #8 ; #9 ~ \q_stop
    {
      \use:e
        {
          \@@_data_auxiv:w
            #1 ;
            #2 ;
            \@@_data_category:n {#3} ;
            \@@_data_offset:nn {#1} {#7} ;
            \@@_data_offset:nn {#1} {#8} ;
            #9;
        }
    }
  \cs_set:Npn \@@_data_category:n #1
    { \use:c { l_@@_category_ #1 _tl } }
  \cs_set:Npn \@@_data_offset:nn #1#2
    {
      \tl_if_blank:nTF {#2}
        { 0 }
        { \int_eval:n { "#2 - "#1 } }
    }
%    \end{macrocode}
% To deal with ranges, we track the position of the next codepoint expected.
% If there is a gap, we deal with that separately: it could be a range or
% an unused part of the Unicode space. As such, we deal with the current
% codepoint here whether or not there is range to fill in. Upper- and
% lowercase data go into the two-stage table, any titlecase exception is
% just stored in a macro. The data for the codepoint is added to the current
% block, and if that is now complete we move on to save the block.
% The
% case exceptions are all stored as codepoints, with a fixed number of
% balanced text as we know that there are never more than three.
%
% The arguments of \cs{@@_data_auxiv:w} are the codepoint, the
% description, the category number, the uppercase and lowercase offsets,
% and the titlecase mapping as read from the file. A titlecase exception is
% stored only when the offset for the titlecase mapping differs from the
% one for uppercasing. \cs{@@_add:nn} appends one value to the current
% block of the named table and saves the block once it has reached the
% block size.
%    \begin{macrocode}
  \cs_set_protected:Npn \@@_data_auxiv:w #1 ; #2 ; #3 ; #4 ; #5 ; #6 ;
    {
      \int_compare:nNnT {"#1} > \l_@@_next_codepoint_fint_tl
        { \@@_data_auxv:nnnnw {#1} {#3} {#4} {#5} #2 Last> \q_stop }
      \@@_add:nn { category } {#3}
      \@@_add:nn { uppercase } {#4}
      \@@_add:nn { lowercase } {#5}
      \int_compare:nNnF {#4} = { \@@_data_offset:nn {#1} {#6} }
        {
          \tl_const:ce
            { c_@@_titlecase_ \codepoint_str_generate:n {"#1} _tl }
            { {"#6} { } { } }
        }
      \tl_set:Ne \l_@@_next_codepoint_fint_tl
        { \int_eval:n { "#1 + 1 } }
    }
  \cs_set_protected:Npn \@@_add:nn #1#2
    {
      \clist_put_right:cn { l_@@_ #1 _block_clist } {#2}
      \int_compare:nNnT { \clist_count:c { l_@@_ #1 _block_clist } }
        = \c_@@_block_size_int
        { \@@_save_blocks:nn {#1} { 1 } }
    }
%    \end{macrocode}
% Distinguish between a range and a gap, and pass on the appropriate value(s).
% The general category for unassigned characters is \texttt{Cn}, so we
% find the correct value once and then use that. The caller appends
% |Last>| after the description: if the description itself contains
% |Last>| then |#6| picks up the appended copy and we have the end of a
% range, which is filled using the values for the current codepoint.
% Otherwise |#6| is empty and we have a gap, which is filled as unassigned
% with no case mappings. The function is defined with an expanding
% definition so that the value for \texttt{Cn} is inserted at definition
% time; everything else is protected from that expansion.
%    \begin{macrocode}
  \cs_set_protected:Npe \@@_data_auxv:nnnnw #1#2#3#4#5 Last> #6 \q_stop
    {
      \exp_not:N \tl_if_blank:nTF {#6}
        {
          \exp_not:N \@@_range:nnn {#1} { category }
            { \exp_not:V \l_@@_category_Cn_tl }
          \exp_not:N \@@_range:nnn {#1} { uppercase } { 0 }
          \exp_not:N \@@_range:nnn {#1} { lowercase } { 0 }
        }
        {
          \exp_not:N \@@_range:nnn {#1} { category } {#2}
          \exp_not:N \@@_range:nnn {#1} { uppercase } {#3}
          \exp_not:N \@@_range:nnn {#1} { lowercase } {#4}
        }
    }
%    \end{macrocode}
% Calculate the length of the range and the space remaining in the current
% block.
% \cs{@@_range:nnn} takes the (hexadecimal) codepoint at which the range
% ends, the name of the table and the value to fill with: only the first
% of these is read directly, the others being picked up by the
% auxiliaries. The arguments passed on to \cs{@@_range:nnnn} are the number
% of entries which go into the current block (the smaller of the length of
% the range and the space left), the length of the range, the name of the
% table and the value.
%    \begin{macrocode}
  \cs_set_protected:Npn \@@_range:nnn #1
    {
      \exp_args:Nf \@@_range_aux:nnn
        { \int_eval:n { "#1 - \l_@@_next_codepoint_fint_tl } }
    }
  \cs_set_protected:Npn \@@_range_aux:nnn #1#2
    {
      \exp_args:Nf \@@_range:nnnn
        {
          \int_min:nn
            {#1}
            {
              \c_@@_block_size_int
              - \clist_count:c { l_@@_ #2 _block_clist }
            }
        }
        {#1} {#2}
    }
%    \end{macrocode}
% Here we want to do three things: add to and possibly complete the current
% block, add complete blocks quickly, then finish up the range in a final
% open block. We need to avoid, as far as possible, dealing with every
% single codepoint, so the middle step is optimised. For that middle step
% one block consisting entirely of the fill value is constructed (the
% leading comma from the replication is removed), and is then saved once
% for as many whole blocks as fit into the remainder of the range.
%    \begin{macrocode}
  \cs_set_protected:Npn \@@_range:nnnn #1#2#3#4
    {
      \prg_replicate:nn {#1}
        { \clist_put_right:cn { l_@@_ #3 _block_clist } {#4} }
      \int_compare:nNnT { \clist_count:c { l_@@_ #3 _block_clist } }
        = \c_@@_block_size_int
        { \@@_save_blocks:nn {#3} { 1 } }
      \int_compare:nNnF
        { \int_div_truncate:nn { #2 - #1 } \c_@@_block_size_int } = 0
        {
          \tl_set:ce { l_@@_ #3 _block_clist }
            {
              \exp_args:NNe \use:nn \use_none:n
                { \prg_replicate:nn { \c_@@_block_size_int } { , #4 } }
            }
          \@@_save_blocks:nn {#3}
            { \int_div_truncate:nn { (#2 - #1) } \c_@@_block_size_int }
        }
      \prg_replicate:nn
        { \int_mod:nn { #2 - #1 } \c_@@_block_size_int }
        { \clist_put_right:ce { l_@@_ #3 _block_clist } {#4} }
    }
%    \end{macrocode}
% To allow rapid comparison, each completed block is stored locally as a
% comma list: once all of the blocks have been created, they are converted
% into an \texttt{intarray} in one step. The aim here is to check the current
% block against those we've already used, and either match to an existing
% block or save a new block.
% The first argument is the name of the table and the second is the number
% of consecutive stage one entries which are to point to the current
% block. The matched block number starts out as the number the next new
% block would receive, and is replaced if a saved block with the same
% content is found. If it is unchanged after the search, the current block
% is new: it is saved under that number and the number for the next new
% block is stepped.
%    \begin{macrocode}
  \cs_set_protected:Npn \@@_save_blocks:nn #1#2
    {
      \tl_set_eq:Nc \l_@@_matched_block_tl { l_@@_ #1 _block_tl }
      \int_step_inline:nn { \tl_use:c { l_@@_ #1 _block_tl } - 1 }
        {
          \tl_if_eq:ccT
            { l_@@_ #1 _block_clist }
            { l_@@_ #1 _block_ ##1 _clist }
            { \tl_set:Nn \l_@@_matched_block_tl {##1} }
        }
      \int_compare:nNnT
        { \tl_use:c { l_@@_ #1 _block_tl } } = \l_@@_matched_block_tl
        {
          \clist_set_eq:cc
            { l_@@_ #1 _block_ \tl_use:c { l_@@_ #1 _block_tl } _clist }
            { l_@@_ #1 _block_clist }
          \tl_set:ce { l_@@_ #1 _block_tl }
            { \int_eval:n { \tl_use:c { l_@@_ #1 _block_tl } + 1 } }
        }
%    \end{macrocode}
% Here, we avoid \cs{prg_replicate:nn} as the number of tokens generated would be
% high: that shows in the format dump (although \TeX{} recovers memory during
% the subsequent runs). The block number is written to the next |#2|
% positions of the stage one table, the position is advanced by the same
% amount, and the current block is cleared ready for new data.
%    \begin{macrocode}
      \int_step_inline:nnn
        { \tl_use:c { l_@@_ #1 _pos_tl } + 1 }
        { \tl_use:c { l_@@_ #1 _pos_tl } + #2 }
        {
          \exp_args:Nc \__kernel_intarray_gset:Nnn
            { g_@@_ #1 _index_intarray }
            {##1}
            \l_@@_matched_block_tl
        }
      \tl_set:ce { l_@@_ #1 _pos_tl }
        { \int_eval:n { \tl_use:c { l_@@_ #1 _pos_tl } + #2 } }
      \clist_clear:c { l_@@_ #1 _block_clist }
    }
%    \end{macrocode}
% Close out the final block, rename the first stage table, then combine all
% of the block comma-lists into one large second-stage table with offsets.
% As we use an index not an offset, there is a little back-and-forward to do.
% For each table the remainder of the Unicode range is filled with zeros,
% which completes the last block. The stage one table is then copied to a
% name in the constant namespace and the global name is removed. The stage
% two table has one entry for each item of each saved block: block $n$
% occupies the positions starting after $(n - 1)$ times the block size.
% Each saved comma list is worked through item by item, with the position
% within the block counting up from $1$ and the recursion tail quark
% marking the end.
%    \begin{macrocode}
  \cs_set_protected:Npn \@@_finalise_blocks:
    {
      \clist_map_inline:nn { category , uppercase , lowercase }
        {
          \@@_range:nnn { 110000 } {##1} { 0 }
          \@@_finalise_blocks:n {##1}
        }
    }
  \cs_set_protected:Npn \@@_finalise_blocks:n #1
    {
      \cs_gset_eq:cc { c_@@_ #1 _index_intarray }
        { g_@@_ #1 _index_intarray }
      \cs_undefine:c { g_@@_ #1 _index_intarray }
      \intarray_new:cn { g_@@_ #1 _blocks_intarray }
        {
          ( \tl_use:c { l_@@_ #1 _block_tl } - 1 )
          * \c_@@_block_size_int
        }
      \int_step_inline:nn { \tl_use:c { l_@@_ #1 _block_tl } - 1 }
        {
          \exp_args:Nv \@@_finalise_blocks:nnn
            { l_@@_ #1 _block_ ##1 _clist }
            {##1} {#1}
        }
      \cs_gset_eq:cc { c_@@_ #1 _blocks_intarray }
        { g_@@_ #1 _blocks_intarray }
      \cs_undefine:c { g_@@_ #1 _blocks_intarray }
    }
  \cs_set_protected:Npn \@@_finalise_blocks:nnn #1#2#3
    {
      \exp_args:Nnf \@@_finalise_blocks:nnnw { 1 }
        { \int_eval:n { ( #2 - 1 ) * \c_@@_block_size_int } }
        {#3}
        #1 , \q_recursion_tail , \q_recursion_stop
    }
  \cs_set_protected:Npn \@@_finalise_blocks:nnnw #1#2#3#4 ,
    {
      \quark_if_recursion_tail_stop:n {#4}
      \intarray_gset:cnn { g_@@_ #3 _blocks_intarray }
        { #1 + #2 } {#4}
      \exp_args:Nf \@@_finalise_blocks:nnnw
        { \int_eval:n { #1 + 1 } } {#2} {#3}
    }
%    \end{macrocode}
% With the setup done, read the main data file: it's easiest to do that as
% a token list with spaces retained. Within the inner group spaces are
% therefore not ignored, which is why the line ends where a space could
% otherwise be picked up are commented out. Reading stops at the first
% line which is just a space. Once the tables have been finalised the
% stream is closed, in the same way as is done after reading each of the
% other data files.
%    \begin{macrocode}
  \ior_open:Nn \g_@@_data_ior { UnicodeData.txt }
  \group_begin:
    \char_set_catcode_space:n { `\ }%
    \ior_map_variable:NNn \g_@@_data_ior \l_@@_tmpa_tl
      {%
        \if_meaning:w \l_@@_tmpa_tl \c_space_tl
          \exp_after:wN \ior_map_break:
        \fi:
        \exp_after:wN \@@_data_auxi:w \l_@@_tmpa_tl \q_stop
      }%
    \@@_finalise_blocks:
  \group_end:
  \ior_close:N \g_@@_data_ior
\group_end:
%    \end{macrocode}
%
% \begin{macro}[EXP]{\__kernel_codepoint_data:nn}
% \begin{macro}[EXP]{\@@_data:nnn}
%   Recover data from a two-stage table: entirely generic as this applies to
%   all tables (as we use the same block size for all of them). Notice that
%   as we use indices not offsets we have to shuffle out-by-one issues.
%   This
%   function is needed \emph{before} loading the special casing data, as there
%   we need to be able to check the standard case mappings.
%
%   The first argument is the name of the table and the second is the
%   codepoint. The stage one table is read at the position given by the
%   codepoint divided by the block size, plus one, which yields a block
%   number. That is converted to the number of stage two entries before
%   the start of the block. The value required is then found in the stage
%   two table at that number plus the remainder of the codepoint divided
%   by the block size, plus one.
%    \begin{macrocode}
\cs_new:Npn \__kernel_codepoint_data:nn #1#2
  {
    \exp_args:Nf \@@_data:nnn
      {
        \int_eval:n
          {
            \c_@@_block_size_int *
            (
              \intarray_item:cn { c_@@_ #1 _index_intarray }
                {
                  \int_div_truncate:nn {#2}
                    \c_@@_block_size_int + 1
                }
              - 1
            )
          }
      }
      {#2} {#1}
  }
\cs_new:Npn \@@_data:nnn #1#2#3
  {
    \intarray_item:cn { c_@@_ #3 _blocks_intarray }
      { #1 + \int_mod:nn {#2} \c_@@_block_size_int + 1 }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% The other data files all use C-style comments so we have to worry about
% |#| tokens (and reading as strings). The set up for case folding is in two
% parts. For the basic (core) mappings, folding is the same as lower casing in
% most positions so only store the differences. For the more complex foldings,
% always store the result, splitting up the two or three code points in the input
% as required.
%
% In the parser, |#1| is the codepoint, |#2| the status field of the line
% and |#3| the mapping. Only the first character of the status is tested
% (the |?| guards against an empty field): \texttt{C} entries are stored
% only if they differ from the result of the lowercase table, \texttt{F}
% entries are passed on to be split up, and anything else is ignored.
%    \begin{macrocode}
\group_begin:
  \ior_open:Nn \g_@@_data_ior { CaseFolding.txt }
  \cs_set_protected:Npn \@@_data_auxi:w #1 ;~ #2 ;~ #3 ; #4 \q_stop
    {
      \if:w \tl_head:n { #2 ? } C
        \reverse_if:N \if_int_compare:w
          \int_eval:n
            { \__kernel_codepoint_data:nn { lowercase } {"#1} + "#1 }
            = "#3 ~
          \tl_const:ce
            { c_@@_casefold_ \codepoint_str_generate:n {"#1} _tl }
            { {"#3} { } { } }
        \fi:
      \else:
        \if:w \tl_head:n { #2 ? } F
          \@@_data_auxii:w #1 ~ #3 ~ \q_stop
        \fi:
      \fi:
    }
%    \end{macrocode}
% Here, |#4| can have a trailing space, so we tidy up a bit at the cost of
% speed for the small number of cases it applies to.
% The tidying is done by converting the value to a number and back to
% hexadecimal digits. The file is read line by line as a string, and any
% line for which the first item is a hash is skipped: as a hash is
% appended to the line before taking the head, this covers empty lines
% as well as comments.
%    \begin{macrocode}
  \cs_set_protected:Npn \@@_data_auxii:w #1 ~ #2 ~ #3 ~ #4 \q_stop
    {
      \tl_const:ce
        { c_@@_casefold_ \codepoint_str_generate:n {"#1} _tl }
        {
          {"#2}
          {"#3}
          { \tl_if_blank:nF {#4} { " \int_to_Hex:n {"#4} } }
        }
    }
  \ior_str_map_inline:Nn \g_@@_data_ior
    {
      \reverse_if:N \if:w \c_hash_str \tl_head:w #1 \c_hash_str \q_stop
        \@@_data_auxi:w #1 \q_stop
      \fi:
    }
  \ior_close:N \g_@@_data_ior
%    \end{macrocode}
% For upper- and lowercasing special situations, there is a bit more to
% do as we also have titlecasing to consider, plus we need to stop part-way
% through the file. The fields |#2|, |#3| and |#4| of each line are used as
% the lowercase, titlecase and uppercase mappings, respectively: the
% titlecase one is stored only if it differs from the uppercase one. A
% mapping is stored only if it contains at least two codepoints, and
% reading stops at the comment line which introduces the conditional
% mappings.
%    \begin{macrocode}
  \ior_open:Nn \g_@@_data_ior { SpecialCasing.txt }
  \cs_set_protected:Npn \@@_data_auxi:w
    #1 ;~ #2 ;~ #3 ;~ #4 ; #5 \q_stop
    {
      \use:n { \@@_data_auxii:w #1 ~ lower ~ #2 ~ } ~ \q_stop
      \use:n { \@@_data_auxii:w #1 ~ upper ~ #4 ~ } ~ \q_stop
      \str_if_eq:nnF {#3} {#4}
        { \use:n { \@@_data_auxii:w #1 ~ title ~ #3 ~ } ~ \q_stop }
    }
  \cs_set_protected:Npn \@@_data_auxii:w
    #1 ~ #2 ~ #3 ~ #4 ~ #5 \q_stop
    {
      \tl_if_empty:nF {#4}
        {
          \tl_const:ce
            { c_@@_ #2 case_ \codepoint_str_generate:n {"#1} _tl }
            {
              {"#3}
              {"#4}
              { \tl_if_blank:nF {#5} {"#5} }
            }
        }
    }
  \ior_str_map_inline:Nn \g_@@_data_ior
    {
      \str_if_eq:eeTF { \tl_head:w #1 \c_hash_str \q_stop } { \c_hash_str }
        {
          \str_if_eq:eeT
            {#1}
            { \c_hash_str \c_space_tl Conditional~Mappings }
            { \ior_map_break: }
        }
        { \@@_data_auxi:w #1 \q_stop }
    }
  \ior_close:N \g_@@_data_ior
\group_end:
%    \end{macrocode}
%
% \begin{macro}[EXP]{\__kernel_codepoint_case:nn}
% \begin{macro}[EXP]{\@@_case:nnn}
% \begin{macro}[EXP]
%   {\@@_uppercase:n, \@@_lowercase:n, \@@_titlecase:n, \@@_casefold:n}
% \begin{macro}[EXP]{\@@_case:nn}
%   With the core data files loaded, there is now a need to provide access to
%   this information for other modules. That is done here such that case
%   folding can also be covered.
%   At this level, all that needs to be returned
%   is the mapping for the codepoint as three brace groups, with any
%   unused trailing groups left empty. The first argument is the name of
%   the mapping and the second is the codepoint. If an exception was
%   stored for the codepoint when loading the data, that is used.
%   Otherwise the result comes from the two-stage tables: the stored
%   offset is added to the codepoint. There are tables only for upper- and
%   lowercasing, so without an exception titlecasing uses the uppercase
%   table and case folding uses the lowercase one.
%    \begin{macrocode}
\cs_new:Npn \__kernel_codepoint_case:nn #1#2
  {
    \exp_args:Ne \@@_case:nnn
      { \codepoint_str_generate:n {#2} } {#1} {#2}
  }
\cs_new:Npn \@@_case:nnn #1#2#3
  {
    \cs_if_exist:cTF { c_@@_ #2 _ #1 _tl }
      {
        \tl_use:c
          { c_@@_ #2 _ #1 _tl }
      }
      { \use:c { @@_ #2 :n } {#3} }
  }
\cs_new:Npn \@@_uppercase:n { \@@_case:nn { uppercase } }
\cs_new:Npn \@@_lowercase:n { \@@_case:nn { lowercase } }
\cs_new:Npn \@@_titlecase:n { \@@_case:nn { uppercase } }
\cs_new:Npn \@@_casefold:n { \@@_case:nn { lowercase } }
\cs_new:Npn \@@_case:nn #1#2
  {
    { \int_eval:n { \__kernel_codepoint_data:nn {#1} {#2} + #2 } }
    { }
    { }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \begin{macro}[EXP]{\@@_nfd:n}
% \begin{macro}[EXP]{\@@_nfd:nn}
%   A simple interface. The result is two brace groups: the stored
%   decomposition if there is one, and otherwise the codepoint itself
%   followed by an empty group.
%    \begin{macrocode}
\cs_new:Npn \@@_nfd:n #1
  {
    \exp_args:Ne \@@_nfd:nn
      { \codepoint_str_generate:n {#1} } {#1}
  }
\cs_new:Npn \@@_nfd:nn #1#2
  {
    \tl_if_exist:cTF { c_@@_nfd_ #1 _tl }
      { \tl_use:c { c_@@_nfd_ #1 _tl } }
      { {#2} { } }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
%    \begin{macrocode}
%<@@=text>
%    \end{macrocode}
%
% Read the Unicode grapheme data. This is quite easy to handle and we only need
% codepoints, not characters, so there is no need to worry about the engine in use.
% As reading as a string is most convenient, we have to do some work to remove
% spaces: the hardest part of the entire process!
%
% Two local macros are used: one holds the name of the property currently
% being collected and the other the list collected so far, in which every
% entry is preceded by a comma. Each entry has the form
% \meta{start}|..|\meta{end}: by appending the codepoint field to itself a
% single codepoint ends up as a range from itself to itself. When the
% property name changes, the list collected for the previous property is
% stored as a constant comma list (without the leading comma) and a new
% list is started. Comment lines and blank lines are skipped.
%
% NOTE(review): storing a list is only triggered by a change of property
% name, so the list collected for the last property in the file is not
% stored by the code here. It should be confirmed that nothing uses a
% comma list for that property.
%    \begin{macrocode}
\ior_new:N \g_@@_data_ior
\group_begin:
  \ior_open:Nn \g_@@_data_ior { GraphemeBreakProperty.txt }
  \cs_set_nopar:Npn \l_@@_tmpa_str { }
  \cs_set_nopar:Npn \l_@@_tmpb_str { }
  \cs_set_protected:Npn \@@_data_auxi:w #1 ;~ #2 ~ #3 \q_stop
    {
      \str_if_eq:VnF \l_@@_tmpb_str {#2}
        {
          \str_if_empty:NF \l_@@_tmpb_str
            {
              \clist_const:ce { c_@@_grapheme_ \l_@@_tmpb_str _clist }
                { \exp_after:wN \use_none:n \l_@@_tmpa_str }
              \cs_set_nopar:Npn \l_@@_tmpa_str { }
            }
          \cs_set_nopar:Npn \l_@@_tmpb_str {#2}
        }
      \@@_data_auxii:w #1 .. #1 .. #1 \q_stop
    }
  \cs_set_protected:Npn \@@_data_auxii:w #1 .. #2 .. #3 \q_stop
    {
      \cs_set_nopar:Npe \l_@@_tmpa_str
        {
          \l_@@_tmpa_str ,
          \tl_trim_spaces:n {#1} .. \tl_trim_spaces:n {#2}
        }
    }
  \ior_str_map_inline:Nn \g_@@_data_ior
    {
      \str_if_eq:eeF { \tl_head:w #1 \c_hash_str \q_stop } { \c_hash_str }
        {
          \tl_if_blank:nF {#1}
            { \@@_data_auxi:w #1 \q_stop }
        }
    }
  \ior_close:N \g_@@_data_ior
\group_end:
%    \end{macrocode}
%
%    \begin{macrocode}
%</package>
%    \end{macrocode}
%
% \end{implementation}
%
% \PrintIndex