icupy

Python bindings for ICU4C using pybind11.

Changes from ICU4C

Naming Conventions

Renamed functions, methods, and enums to conform to PEP 8.
- Function Names: use lower_case_with_underscores style.
- Method Names: use lower_case_with_underscores style. Also, use one leading underscore only for protected methods.
- C++ Enum Member Names: use UPPER_CASE_WITH_UNDERSCORES style without a leading "k". (e.g., kDateOffset → DATE_OFFSET)
- APIs whose names collide with Python reserved words get a trailing underscore: e.g.,
  - with() → with_()

Error Handling

ICU C/C++ API errors are raised as icupy.icu.ICUError exceptions. The underlying UErrorCode can be retrieved from the error_code attribute of the exception.

For example:

from icupy import icu
try:
    pass  # Call ICU API here...
except icu.ICUError as e:
    print(e.error_code)  # → icu.ErrorCode
    print(e.error_code.get())  # → icu.UErrorCode

Examples

icu::UnicodeString with predefined error callback function

# from Unicode to codepage
from icupy import icu
cnv = icu.ucnv_open("iso8859-1")
context = icu.UserContext(icu.UCNV_ESCAPE_C)  # \uXXXX
action = icu.UConverterFromUCallback(icu.UCNV_FROM_U_CALLBACK_ESCAPE, context)
old_action = icu.ucnv_set_from_u_call_back(cnv, action)
s = icu.UnicodeString("A€B")
s.extract(cnv)  # → b'A\\u20ACB'

# from codepage to Unicode
from icupy import icu
cnv = icu.ucnv_open("Shift-JIS")
context = icu.UserContext(icu.UCNV_ESCAPE_XML_HEX)  # &#xXXXX;
action = icu.UConverterToUCallback(icu.UCNV_TO_U_CALLBACK_ESCAPE, context)
old_action = icu.ucnv_set_to_u_call_back(cnv, action)
src = b"\x61\xeb\x40\x62"  # 0xeb 0x40: UNASSIGNED SEQUENCE
s = icu.UnicodeString(src, -1, cnv)
str(s)  # → 'a&#xEB;&#x40;b'

icu::UnicodeString with custom error callback function

# from Unicode to codepage
from icupy import icu
from icupy.utils import gc
def from_unicode_cb(
    options: object,
    args: icu.UConverterFromUnicodeArgs,
    code_units: str,
    length: int,
    code_point: int,
    reason: icu.UConverterCallbackReason,
    error_code: icu.ErrorCode,
) -> None:
    r"""Emit a lowercase ``\uXXXX`` escape for every element of *code_units*.

    The callback acts only when *reason* is ``UCNV_UNASSIGNED``,
    ``UCNV_ILLEGAL`` or ``UCNV_IRREGULAR``; for any other reason it returns
    without changing *error_code* or writing anything.

    *options*, *length* and *code_point* are accepted but not used.
    """
    del options, length, code_point  # unused
    handled_reasons = (icu.UCNV_UNASSIGNED, icu.UCNV_ILLEGAL, icu.UCNV_IRREGULAR)
    if reason not in handled_reasons:
        return
    # Reset the error code to U_ZERO_ERROR before writing the replacement.
    error_code.set(icu.U_ZERO_ERROR)
    escaped = "".join(f"\\u{ord(unit):04x}" for unit in code_units)
    icu.ucnv_cb_from_u_write_bytes(args, escaped, len(escaped), 0)

with gc(icu.ucnv_open("iso8859-1"), icu.ucnv_close) as cnv:
    action = icu.UConverterFromUCallback(from_unicode_cb)
    old_action = icu.ucnv_set_from_u_call_back(cnv, action)
    s = icu.UnicodeString("A€B")
    s.extract(cnv)  # → b'A\\u20acB'

# from codepage to Unicode
from icupy import icu
from icupy.utils import gc
def to_unicode_cb(
    options: object,
    args: icu.UConverterToUnicodeArgs,
    code_units: bytes,
    length: int,
    reason: icu.UConverterCallbackReason,
    error_code: icu.ErrorCode,
) -> None:
    """Emit an uppercase ``%XX`` escape for every byte of *code_units*.

    The callback acts only when *reason* is ``UCNV_UNASSIGNED``,
    ``UCNV_ILLEGAL`` or ``UCNV_IRREGULAR``; for any other reason it returns
    without changing *error_code* or writing anything.

    *options* and *length* are accepted but not used.
    """
    del options, length  # unused
    handled_reasons = (icu.UCNV_UNASSIGNED, icu.UCNV_ILLEGAL, icu.UCNV_IRREGULAR)
    if reason not in handled_reasons:
        return
    # Reset the error code to U_ZERO_ERROR before writing the replacement.
    error_code.set(icu.U_ZERO_ERROR)
    escaped = "".join(f"%{byte:02X}" for byte in code_units)
    icu.ucnv_cb_to_u_write_uchars(args, escaped, len(escaped), 0)

with gc(icu.ucnv_open("Shift-JIS"), icu.ucnv_close) as cnv:
    action = icu.UConverterToUCallback(to_unicode_cb)
    old_action = icu.ucnv_set_to_u_call_back(cnv, action)
    src = b"\x61\xeb\x40\x62"  # 0xeb 0x40: UNASSIGNED SEQUENCE
    s = icu.UnicodeString(src, -1, cnv)
    str(s)  # → 'a%EB%40b'

icu::BreakIterator for word-breaks

from icupy import icu
bi = icu.BreakIterator.create_word_instance("en_US")
src = icu.UnicodeString("Alice was beginning to get very tired of sitting by her sister on the bank.")
bi.set_text(src)
result = []
start = bi.first()
while (end := bi.next()) != icu.UBRK_DONE:
    if bi.get_rule_status() != icu.UBRK_WORD_NONE:
        result.append(src[start:end])
    start = end

# result: ['Alice', 'was', 'beginning', 'to', 'get', 'very', 'tired', 'of', 'sitting', 'by', 'her', 'sister', 'on', 'the', 'bank']

Natural sort (human-friendly sorting)

from icupy import icu
coll = icu.Collator.create_instance("en_US")
coll.set_attribute(icu.UCOL_NUMERIC_COLLATION, icu.UCOL_ON)
data = ["file1.txt", "file10.txt", "file2.txt", "file20.txt", "file3.txt"]
sorted(data, key=coll.get_sort_key)
# ['file1.txt', 'file2.txt', 'file3.txt', 'file10.txt', 'file20.txt']

icu::IDNA (UTS #46)

from icupy import icu
uts46 = icu.IDNA.create_uts46_instance(icu.UIDNA_DEFAULT | icu.UIDNA_CHECK_BIDI | icu.UIDNA_CHECK_CONTEXTJ)
dest = icu.UnicodeString()
info = icu.IDNAInfo()
# a + ZERO WIDTH NON-JOINER + b.com
uts46.name_to_ascii("a\u200cb.com", dest, info)  # → 'xn--ab-j1t.com'
bool(info.get_errors() & icu.UIDNA_ERROR_BIDI)  # → False
bool(info.get_errors() & icu.UIDNA_ERROR_CONTEXTJ)  # → True

icu::number::NumberFormatter (ICU 60+)

from icupy import icu
from icupy.icu import number
template = (
    number.NumberFormatter.with_()
    .notation(number.Notation.compact_short())
    .unit(icu.CurrencyUnit("EUR"))
    .precision(number.Precision.max_significant_digits(2))
)
template.locale("en_US").format_int(1234).to_string()  # "€1.2K" in en-US

icu::RegexMatcher::find with custom callback function

from icupy import icu
src = icu.UnicodeString("aaaaaaaaaaaaaaaaaaab")
matcher = icu.RegexMatcher("((.)\\2)x", src, 0)
def progress_callback(options: dict[str, int], match_index: int) -> bool:
    """Record find progress in *options* and report whether to keep going.

    Each call increments ``options["numCalls"]`` and stores *match_index* in
    ``options["lastIndex"]``. If ``options["maxCalls"]`` is present and
    non-negative, the function returns ``False`` once the call count reaches
    that limit; otherwise it returns ``True``.

    Returns ``False`` immediately, without recording anything, when
    *options* is not a ``dict``.
    """
    if not isinstance(options, dict):
        return False
    options["numCalls"] = options.get("numCalls", 0) + 1
    options["lastIndex"] = match_index
    limit = options.get("maxCalls", -1)
    # A negative limit means "no limit".
    return limit < 0 or options["numCalls"] < limit

info = {}
context = icu.UserContext(info)
callback = icu.URegexFindProgressCallback(progress_callback, context)
matcher.set_find_progress_callback(callback)
matcher.find(0)  # → False
# info: {'numCalls': 18, 'lastIndex': 18}
info.clear()
info["maxCalls"] = 5
matcher.find(0)  # → ICUError: U_REGEX_STOPPED_BY_CALLER
# info: {'maxCalls': 5, 'numCalls': 5, 'lastIndex': 5}

icu::number::SimpleNumberFormatter (ICU 73+)

from icupy import icu
from icupy.icu import number
fmt = number.SimpleNumberFormatter.for_locale_and_grouping_strategy("de-CH", icu.UNUM_GROUPING_ON_ALIGNED)
fmtval = fmt.format_int64(1234567)
fmtval.to_string()  # → "1'234'567"

Subclassing icu::Transliterator

# Uppercase letters while skipping text enclosed in backticks
from icupy import icu
class TestTransliterator(icu.Transliterator):
    """Transliterator that uppercases lowercase letters outside backticks.

    Text between a pair of backtick characters (U+0060) is left unchanged.
    The transliterator is created with the ID ``"Any-UpperWithoutCode"``.
    """

    def __init__(self, filter_set: icu.UnicodeSet | None = None) -> None:
        """Initialize the base class with the fixed ID and optional *filter_set*."""
        icu.Transliterator.__init__(self, "Any-UpperWithoutCode", filter_set)

    def _handle_transliterate(
        self,
        text: icu.Replaceable,
        pos: icu.UTransPosition,
        incremental: bool,
    ) -> None:
        """Uppercase *text* from ``pos.start`` up to ``pos.limit``.

        Walks the range one code point at a time, toggling a flag at every
        backtick and replacing lowercase letters with their uppercase form
        while the flag is off. On return ``pos.start`` equals ``pos.limit``.
        *incremental* is not used.
        """
        index = pos.start
        inside_code = False
        while index < pos.limit:
            code_point = text.char32_at(index)
            width = icu.u16_length(code_point)
            if code_point == 0x60:
                # Backtick: flip the flag and step over it unchanged.
                inside_code = not inside_code
            elif (
                not inside_code
                and icu.u_isalpha(code_point)
                and icu.u_islower(code_point)
            ):
                replacement = icu.u_toupper(code_point)
                if replacement != code_point:
                    text.handle_replace_between(
                        index, index + width, chr(replacement)
                    )
                    # Advance by the length of what is now stored in the text.
                    width = icu.u16_length(replacement)
            index += width
        pos.start = pos.limit

tl = TestTransliterator()
text = icu.UnicodeString("Subclasses must implement `_handle_transliterate()`, which defines their own transliteration algorithm.")
tl.transliterate(text)
# text: "SUBCLASSES MUST IMPLEMENT `_handle_transliterate()`, WHICH DEFINES THEIR OWN TRANSLITERATION ALGORITHM."

Installation

Prerequisites

Python >=3.10
ICU4C (ICU - The International Components for Unicode) (>=70 recommended)
C++17 compatible compiler (see Supported Compilers)
CMake >=3.15

Installing prerequisites

Windows:

Install the following dependencies:
- Python >=3.10
- Pre-built ICU4C binary package (>=70 recommended)
- C++17 compatible compiler. Visual Studio 2022 or newer recommended
- CMake >=3.15
  - Note: Add CMake to the system PATH.
Linux:

To install dependencies, run the following command:
- Ubuntu/Debian:
```
sudo apt install g++ cmake libicu-dev python3-dev python3-pip
```
- Fedora:
```
sudo dnf install gcc-c++ cmake icu libicu-devel python3-devel
```
Note: If your system's ICU is out of date, consider building ICU4C from source or installing a pre-built ICU4C binary package.

Installing icupy

Configuring environment variables
- Windows:
  - Set the ICU_ROOT environment variable to the root of the ICU installation.
    
    For example, if the ICU is located in C:\icu4c:
    
    in PowerShell:
```
$env:ICU_ROOT = "C:\icu4c"
```
    or in Command Prompt:
```
set ICU_ROOT=C:\icu4c
```
  - To verify settings using icuinfo (64-bit):
    
    in PowerShell:
```
& $env:ICU_ROOT\bin64\icuinfo
```
    or in Command Prompt:
```
%ICU_ROOT%\bin64\icuinfo
```
- Linux:
  - If ICU is installed in a non-standard location, set the PKG_CONFIG_PATH and LD_LIBRARY_PATH environment variables.
    
    For example, if the ICU is located in /usr/local:
```
export PKG_CONFIG_PATH=/usr/local/lib/pkgconfig:$PKG_CONFIG_PATH
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
```
  - To verify settings using pkg-config:
```
pkg-config --cflags --libs icu-uc
# -I/usr/local/include -L/usr/local/lib -licuuc
```
Installing from PyPI
```
pip install icupy
```
Optionally, CMake environment variables are available. For example, using the Ninja build system and Clang:
```
CMAKE_GENERATOR=Ninja CXX=clang++ pip install icupy
```
Alternatively, install the development version from the Git repository:
```
pip install git+https://github.com/miute/icupy.git
```

Usage

Configuring environment variables
- Windows:
  - Set the ICU_ROOT environment variable to the root of the ICU installation (default is C:\icu).
    
    For example, if the ICU is located in C:\icu4c:
    
    in PowerShell:
```
$env:ICU_ROOT = "C:\icu4c"
```
    or in Command Prompt:
```
set ICU_ROOT=C:\icu4c
```
- Linux:
  - If ICU is installed in a non-standard location, set the LD_LIBRARY_PATH environment variable.
    
    For example, if the ICU is located in /usr/local:
```
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
```

Using icupy

import icupy.icu as icu
# or
from icupy import icu

License

This project is licensed under the MIT License.

Name		Name	Last commit message	Last commit date
Latest commit History 613 Commits
.github		.github
src		src
tests		tests
.gitignore		.gitignore
.gitmodules		.gitmodules
.icu-versions.json		.icu-versions.json
.pre-commit-config.yaml		.pre-commit-config.yaml
CHANGELOG.md		CHANGELOG.md
CMakeLists.txt		CMakeLists.txt
LICENSE		LICENSE
README.md		README.md
pyproject.toml		pyproject.toml
tox.ini		tox.ini
uv.lock		uv.lock

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Repository files navigation

icupy

Changes from ICU4C

Examples

Installation

Prerequisites

Installing prerequisites

Installing icupy

Usage

License

About

Uh oh!

Releases 9

Uh oh!

Contributors

Uh oh!

Languages

Folders and files

Latest commit

History

Repository files navigation

icupy

Changes from ICU4C

Examples

Installation

Prerequisites

Installing prerequisites

Installing icupy

Usage

License

About

Topics

Resources

License

Uh oh!

Stars

Watchers

Forks

Releases 9

Uh oh!

Contributors

Uh oh!

Languages