Implementation of the line breaking algorithm as described in Unicode Standard Annex 14. More...
#include <assert.h>
#include <stddef.h>
#include <string.h>
#include "linebreak.h"
#include "linebreakdef.h"
Data Structures | |
struct | LineBreakPropertiesIndex |
Struct for the second-level index to the line breaking properties. More... | |
Defines | |
#define | LINEBREAK_UNDEFINED -1 |
Special value used internally to indicate an undefined break result. | |
#define | LINEBREAK_INDEX_SIZE 40 |
Size of the second-level index to the line breaking properties. | |
Enumerations | |
enum | BreakAction { DIR_BRK, IND_BRK, CMI_BRK, CMP_BRK, PRH_BRK } |
Enumeration of break actions. More... | |
Functions | |
void | init_linebreak (void) |
Initializes the second-level index to the line breaking properties. | |
static struct LineBreakProperties * | get_lb_prop_lang (const char *lang) |
Gets the language-specific line breaking properties. | |
static enum LineBreakClass | get_char_lb_class (utf32_t ch, struct LineBreakProperties *lbp) |
Gets the line breaking class of a character from a line breaking properties array. | |
static enum LineBreakClass | get_char_lb_class_default (utf32_t ch) |
Gets the line breaking class of a character from the default line breaking properties array. | |
static enum LineBreakClass | get_char_lb_class_lang (utf32_t ch, struct LineBreakProperties *lbpLang) |
Gets the line breaking class of a character for a specific language. | |
static enum LineBreakClass | resolve_lb_class (enum LineBreakClass lbc, const char *lang) |
Resolves the line breaking class for certain ambiguous or complicated characters. | |
static void | treat_first_char (struct LineBreakContext *lbpCtx) |
Applies special treatment to the first character in a line. | |
static int | get_lb_result_simple (struct LineBreakContext *lbpCtx) |
Tries to determine the line break opportunity using simple rules. | |
static int | get_lb_result_lookup (struct LineBreakContext *lbpCtx) |
Determines the line break opportunity by table lookup. | |
void | lb_init_break_context (struct LineBreakContext *lbpCtx, utf32_t ch, const char *lang) |
Initializes line breaking context for a given language. | |
int | lb_process_next_char (struct LineBreakContext *lbpCtx, utf32_t ch) |
Updates the LineBreakContext for the next code point and returns the detected break. | |
void | set_linebreaks (const void *s, size_t len, const char *lang, char *brks, get_next_char_t get_next_char) |
Sets the line breaking information for a generic input string. | |
void | set_linebreaks_utf8 (const utf8_t *s, size_t len, const char *lang, char *brks) |
Sets the line breaking information for a UTF-8 input string. | |
void | set_linebreaks_utf16 (const utf16_t *s, size_t len, const char *lang, char *brks) |
Sets the line breaking information for a UTF-16 input string. | |
void | set_linebreaks_utf32 (const utf32_t *s, size_t len, const char *lang, char *brks) |
Sets the line breaking information for a UTF-32 input string. | |
int | is_line_breakable (utf32_t char1, utf32_t char2, const char *lang) |
Tells whether a line break can occur between two Unicode characters. | |
Variables | |
static enum BreakAction | baTable [LBP_RI][LBP_RI] |
Break action pair table. | |
static struct LineBreakPropertiesIndex | lb_prop_index [LINEBREAK_INDEX_SIZE] |
Second-level index to the line breaking properties. |
Implementation of the line breaking algorithm as described in Unicode Standard Annex 14.
#define LINEBREAK_INDEX_SIZE 40 |
Size of the second-level index to the line breaking properties.
#define LINEBREAK_UNDEFINED -1 |
Special value used internally to indicate an undefined break result.
enum BreakAction |
static enum LineBreakClass get_char_lb_class | ( | utf32_t | ch, | |
struct LineBreakProperties * | lbp | |||
) | [static] |
Gets the line breaking class of a character from a line breaking properties array.
ch | character to check | |
lbp | pointer to the line breaking properties array |
Returns the line breaking class if found; LBP_XX otherwise.
static enum LineBreakClass get_char_lb_class_default | ( | utf32_t | ch | ) | [static] |
Gets the line breaking class of a character from the default line breaking properties array.
ch | character to check |
Returns the line breaking class if found; LBP_XX otherwise.
static enum LineBreakClass get_char_lb_class_lang | ( | utf32_t | ch, | |
struct LineBreakProperties * | lbpLang | |||
) | [static] |
Gets the line breaking class of a character for a specific language.
This function will check the language-specific data first, and then the default data if there is no language-specific property available for the character.
ch | character to check | |
lbpLang | pointer to the language-specific line breaking properties array |
Returns the line breaking class if found; LBP_XX otherwise.
static struct LineBreakProperties* get_lb_prop_lang | ( | const char * | lang | ) | [static, read] |
Gets the language-specific line breaking properties.
lang | language of the text |
Returns a pointer to the language-specific line breaking properties array if found; NULL otherwise.
static int get_lb_result_lookup | ( | struct LineBreakContext * | lbpCtx | ) | [static] |
Determines the line break opportunity by table lookup.
[in,out] | lbpCtx | pointer to the line breaking context |
static int get_lb_result_simple | ( | struct LineBreakContext * | lbpCtx | ) | [static] |
Tries to determine the line break opportunity using simple rules.
[in,out] | lbpCtx | pointer to the line breaking context |
void init_linebreak | ( | void | ) |
Initializes the second-level index to the line breaking properties.
If this function is not called, the performance of get_char_lb_class_lang (and thus of the main functionality) can be quite poor, especially for large code points such as those of Chinese characters.
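int is_line_breakable | ( | utf32_t | char1, | |
utf32_t | char2, | |||
const char * | lang | |||
) |
Tells whether a line break can occur between two Unicode characters.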
This is a wrapper function that exposes a simple interface. Generally speaking, it is better to use set_linebreaks_utf32 instead, since this function cannot correctly process complicated cases involving combining marks, spaces, etc.
char1 | the first Unicode character | |
char2 | the second Unicode character | |
lang | language of the input |
void lb_init_break_context | ( | struct LineBreakContext * | lbpCtx, | |
utf32_t | ch, | |||
const char * | lang | |||
) |
Initializes line breaking context for a given language.
[in,out] | lbpCtx | pointer to the line breaking context |
[in] | ch | the first character to process |
[in] | lang | language of the input |
int lb_process_next_char | ( | struct LineBreakContext * | lbpCtx, | |
utf32_t | ch | |||
) |
Updates the LineBreakContext for the next code point and returns the detected break.
[in,out] | lbpCtx | pointer to the line breaking context |
[in] | ch | Unicode code point |
static enum LineBreakClass resolve_lb_class | ( | enum LineBreakClass | lbc, | |
const char * | lang | |||
) | [static] |
Resolves the line breaking class for certain ambiguous or complicated characters.
They are treated in a simplistic way in this implementation.
lbc | line breaking class to resolve | |
lang | language of the text |
void set_linebreaks | ( | const void * | s, | |
size_t | len, | |||
const char * | lang, | |||
char * | brks, | |||
get_next_char_t | get_next_char | |||
) |
Sets the line breaking information for a generic input string.
[in] | s | input string |
[in] | len | length of the input |
[in] | lang | language of the input |
[out] | brks | pointer to the output breaking data, containing LINEBREAK_MUSTBREAK, LINEBREAK_ALLOWBREAK, LINEBREAK_NOBREAK, or LINEBREAK_INSIDEACHAR |
[in] | get_next_char | function to get the next UTF-32 character |
void set_linebreaks_utf16 | ( | const utf16_t * | s, | |
size_t | len, | |||
const char * | lang, | |||
char * | brks | |||
) |
Sets the line breaking information for a UTF-16 input string.
[in] | s | input UTF-16 string |
[in] | len | length of the input |
[in] | lang | language of the input |
[out] | brks | pointer to the output breaking data, containing LINEBREAK_MUSTBREAK, LINEBREAK_ALLOWBREAK, LINEBREAK_NOBREAK, or LINEBREAK_INSIDEACHAR |
void set_linebreaks_utf32 | ( | const utf32_t * | s, | |
size_t | len, | |||
const char * | lang, | |||
char * | brks | |||
) |
Sets the line breaking information for a UTF-32 input string.
[in] | s | input UTF-32 string |
[in] | len | length of the input |
[in] | lang | language of the input |
[out] | brks | pointer to the output breaking data, containing LINEBREAK_MUSTBREAK, LINEBREAK_ALLOWBREAK, LINEBREAK_NOBREAK, or LINEBREAK_INSIDEACHAR |
void set_linebreaks_utf8 | ( | const utf8_t * | s, | |
size_t | len, | |||
const char * | lang, | |||
char * | brks | |||
) |
Sets the line breaking information for a UTF-8 input string.
[in] | s | input UTF-8 string |
[in] | len | length of the input |
[in] | lang | language of the input |
[out] | brks | pointer to the output breaking data, containing LINEBREAK_MUSTBREAK, LINEBREAK_ALLOWBREAK, LINEBREAK_NOBREAK, or LINEBREAK_INSIDEACHAR |
static void treat_first_char | ( | struct LineBreakContext * | lbpCtx | ) | [static] |
Applies special treatment to the first character in a line.
[in,out] | lbpCtx | pointer to the line breaking context |
enum BreakAction baTable[LBP_RI][LBP_RI] [static] |
Break action pair table.
This is a direct mapping of Table 2 of Unicode Standard Annex 14, Revision 30.
struct LineBreakPropertiesIndex lb_prop_index[LINEBREAK_INDEX_SIZE] [static] |
{ { 0xFFFFFFFF, lb_prop_default } }
Second-level index to the line breaking properties.