src/wordbreak.c File Reference
Implementation of the word breaking algorithm as described in Unicode Standard Annex 29.
More...
#include <assert.h>
#include <stddef.h>
#include <string.h>
#include "unibreakdef.h"
#include "wordbreak.h"
#include "wordbreakdata.c"
Defines |
#define | ARRAY_LEN(x) (sizeof(x) / sizeof(x[0])) |
#define | IS_WB3ab(cls) |
Functions |
void | init_wordbreak (void) |
| Initializes the wordbreak internals.
|
static enum WordBreakClass | get_char_wb_class (utf32_t ch, struct WordBreakProperties *wbp, size_t len) |
| Gets the word breaking class of a character.
|
static void | set_brks_to (const void *s, char *brks, size_t posStart, size_t posEnd, size_t len, char brkType, get_next_char_t get_next_char) |
| Sets the word break types to a specific value in a range.
|
static void | set_wordbreaks (const void *s, size_t len, const char *lang, char *brks, get_next_char_t get_next_char) |
| Sets the word breaking information for a generic input string.
|
void | set_wordbreaks_utf8 (const utf8_t *s, size_t len, const char *lang, char *brks) |
| Sets the word breaking information for a UTF-8 input string.
|
void | set_wordbreaks_utf16 (const utf16_t *s, size_t len, const char *lang, char *brks) |
| Sets the word breaking information for a UTF-16 input string.
|
void | set_wordbreaks_utf32 (const utf32_t *s, size_t len, const char *lang, char *brks) |
| Sets the word breaking information for a UTF-32 input string.
|
Detailed Description
Implementation of the word breaking algorithm as described in Unicode Standard Annex 29.
- Version:
- 3.0, 2015/05/10
- Author:
- Tom Hacohen
Define Documentation
#define ARRAY_LEN |
( |
x |
|
) |
(sizeof(x) / sizeof(x[0])) |
Function Documentation
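static enum WordBreakClass get_char_wb_class (utf32_t ch, struct WordBreakProperties *wbp, size_t len) [static]
Gets the word breaking class of a character.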
- Parameters:
-
| ch | character to check |
| wbp | pointer to the word breaking properties array |
| len | size of the wbp array in number of items |
- Returns:
- the word breaking class if found;
WBP_Any
otherwise
void init_wordbreak |
( |
void |
|
) |
|
Initializes the wordbreak internals.
It currently does nothing, but it may in the future.
static void set_brks_to |
( |
const void * |
s, |
|
|
char * |
brks, |
|
|
size_t |
posStart, |
|
|
size_t |
posEnd, |
|
|
size_t |
len, |
|
|
char |
brkType, |
|
|
get_next_char_t |
get_next_char | |
|
) |
| | [static] |
Sets the word break types to a specific value in a range.
It sets the positions inside a character to WORDBREAK_INSIDEACHAR and the remaining positions to brkType. It assumes brks has already been initialized: every cell holding WORDBREAK_NOBREAK marks a position after which a break must not occur.
- Parameters:
-
[in] | s | input string |
[out] | brks | breaks array to fill |
[in] | posStart | start position |
[in] | posEnd | end position (exclusive) |
[in] | len | length of the string |
[in] | brkType | breaks type to use |
[in] | get_next_char | function to get the next UTF-32 character |
static void set_wordbreaks |
( |
const void * |
s, |
|
|
size_t |
len, |
|
|
const char * |
lang, |
|
|
char * |
brks, |
|
|
get_next_char_t |
get_next_char | |
|
) |
| | [static] |
Sets the word breaking information for a generic input string.
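- Parameters:
-
[in] | s | input string |
[in] | len | length of the input string |
[in] | lang | language of the input string |
[out] | brks | breaks array to fill |
[in] | get_next_char | function to get the next UTF-32 character |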
void set_wordbreaks_utf16 |
( |
const utf16_t * |
s, |
|
|
size_t |
len, |
|
|
const char * |
lang, |
|
|
char * |
brks | |
|
) |
| | |
Sets the word breaking information for a UTF-16 input string.
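- Parameters:
-
[in] | s | input UTF-16 string |
[in] | len | length of the input string |
[in] | lang | language of the input string |
[out] | brks | breaks array to fill |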
void set_wordbreaks_utf32 |
( |
const utf32_t * |
s, |
|
|
size_t |
len, |
|
|
const char * |
lang, |
|
|
char * |
brks | |
|
) |
| | |
Sets the word breaking information for a UTF-32 input string.
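- Parameters:
-
[in] | s | input UTF-32 string |
[in] | len | length of the input string |
[in] | lang | language of the input string |
[out] | brks | breaks array to fill |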
void set_wordbreaks_utf8 |
( |
const utf8_t * |
s, |
|
|
size_t |
len, |
|
|
const char * |
lang, |
|
|
char * |
brks | |
|
) |
| | |
Sets the word breaking information for a UTF-8 input string.
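- Parameters:
-
[in] | s | input UTF-8 string |
[in] | len | length of the input string |
[in] | lang | language of the input string |
[out] | brks | breaks array to fill |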