Utilities for handling Unicode text
Structs
Functions
-
Returns the number of bytes needed to encode the given Unicode code point as UTF-8.
-
Returns the number of 16-bit values needed to encode the given Unicode code point as UTF-16.
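The counts returned by these two functions follow directly from the UTF-8 and UTF-16 encoding ranges. A minimal sketch of the logic in Rust (the function names here are illustrative, not this module's API):

```rust
// Number of bytes needed to encode a Unicode scalar value in UTF-8.
fn utf8_len(code: u32) -> usize {
    match code {
        0x0000..=0x007F => 1, // ASCII
        0x0080..=0x07FF => 2,
        0x0800..=0xFFFF => 3,
        _ => 4,               // up to U+10FFFF
    }
}

// Number of 16-bit units needed to encode a Unicode scalar value in UTF-16.
fn utf16_len(code: u32) -> usize {
    if code <= 0xFFFF { 1 } else { 2 } // BMP vs. surrogate pair
}

fn main() {
    assert_eq!(utf8_len(0x61), 1);     // 'a'
    assert_eq!(utf8_len(0x1EEF), 3);   // 'ữ'
    assert_eq!(utf8_len(0x1F4A9), 4);  // '💩'
    assert_eq!(utf16_len(0x61), 1);
    assert_eq!(utf16_len(0x1F4A9), 2); // needs a surrogate pair
}
```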
-
Returns true if the Unicode code point is a valid Unicode scalar value. A Unicode scalar value is a Unicode code point in the range U+0000..U+D7FF or U+E000..U+10FFFF.
Example
use std::string::unicode::is_scalar_value;

assert!(is_scalar_value(0x61));
assert!(!is_scalar_value(0xD800));
assert!(!is_scalar_value(0x1FFFFF));
-
Returns true if the 16-bit value is a surrogate (either high or low).
-
Encodes the given Unicode code point as UTF-8 into the given byte buffer. Returns the number of bytes written.
Returns an error if the buffer is too small or if the code point is not a valid Unicode scalar value.
Example
use std::string::unicode::encode_utf8;

let buf: [u8; 10];
let size = encode_utf8(0x1F4A9, &buf).unwrap();
assert_eq!(buf[..size] as &[u8], "💩");
-
Encodes the given Unicode code point as UTF-16 into the given 16-bit value buffer. Returns the number of values written.
Returns an error if the buffer is too small or if the code point is not a valid Unicode scalar value.
Example
use std::string::unicode::encode_utf16;

let buf: [u16; 10];
let size = encode_utf16(0x1F4A9, &buf).unwrap();
assert_eq!(size, 2);
assert_eq!(buf[0], 0xD83D);
assert_eq!(buf[1], 0xDCA9);
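The surrogate-pair arithmetic behind this example (0x1F4A9 splitting into 0xD83D and 0xDCA9) can be sketched in Rust; this is an illustrative model, not this module's implementation:

```rust
// Encode a supplementary-plane scalar value (> U+FFFF) as a UTF-16
// surrogate pair: subtract 0x10000, then split the remaining 20 bits.
fn surrogate_pair(code: u32) -> (u16, u16) {
    assert!(code > 0xFFFF && code <= 0x10FFFF);
    let v = code - 0x10000;             // 20 bits remain
    let high = 0xD800 + (v >> 10);      // top 10 bits -> high surrogate
    let low = 0xDC00 + (v & 0x3FF);     // bottom 10 bits -> low surrogate
    (high as u16, low as u16)
}

// A 16-bit unit is a surrogate iff it falls in 0xD800..=0xDFFF.
fn is_surrogate(unit: u16) -> bool {
    (0xD800..=0xDFFF).contains(&unit)
}

fn main() {
    assert_eq!(surrogate_pair(0x1F4A9), (0xD83D, 0xDCA9)); // 💩
    assert!(is_surrogate(0xD83D));
    assert!(is_surrogate(0xDCA9));
    assert!(!is_surrogate(0x0061)); // 'a'
}
```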
-
Decodes a single UTF-8 code point from the given byte slice.
Returns the code point and the number of bytes read. This method validates that the returned code point is a valid Unicode scalar value and that it is encoded with the minimum number of bytes.
See also decode_utf8_unchecked which does not perform any validation, but can be faster.
Examples
use std::string::unicode::decode_utf8;

let s = "😊";
let (code, len) = decode_utf8(s).unwrap();
assert_eq!(code, 0x1F60A);
assert_eq!(len, 4);
use std::string::unicode::decode_utf8;

let s = "\xed\xa0\x80"; // pseudo-UTF8 representing U+D800 surrogate
decode_utf8(s).unwrap(); // panics
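The rejection of "\xed\xa0\x80" follows from the 3-byte decoding arithmetic: the bytes decode to U+D800, which is a surrogate and therefore not a scalar value. A Rust sketch of this check (illustrative, not this module's code):

```rust
// Decode a 3-byte UTF-8 sequence (leading byte 0xE0..=0xEF) and validate
// that the result is a Unicode scalar value encoded minimally.
fn decode_3byte(b: [u8; 3]) -> Result<u32, &'static str> {
    let code = ((b[0] as u32 & 0x0F) << 12)
        | ((b[1] as u32 & 0x3F) << 6)
        | (b[2] as u32 & 0x3F);
    if code < 0x0800 {
        Err("overlong encoding") // would fit in fewer bytes
    } else if (0xD800..=0xDFFF).contains(&code) {
        Err("surrogate code point") // not a scalar value
    } else {
        Ok(code)
    }
}

fn main() {
    // "\xed\xa0\x80" from the example above decodes to U+D800 -> rejected.
    assert_eq!(decode_3byte([0xED, 0xA0, 0x80]), Err("surrogate code point"));
    // "ữ" (U+1EEF) is a well-formed 3-byte sequence.
    assert_eq!(decode_3byte([0xE1, 0xBB, 0xAF]), Ok(0x1EEF));
}
```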
-
Decodes a single UTF-8 code point from the given byte slice without any validation.
Returns the code point and the number of bytes read. This method does not validate that the returned code point is a valid Unicode scalar value or that it is encoded with the minimum number of bytes.
Examples
use std::string::unicode::decode_utf8_unchecked;

let s = "\xed\xa0\x80"; // pseudo-UTF8 representing U+D800 surrogate
let (code, len) = decode_utf8_unchecked(s);
assert_eq!(code, 0xD800);
assert_eq!(len, 3);
use std::string::unicode::decode_utf8_unchecked;

let s = "\xed"; // truncated UTF-8 sequence
// panics or UBs depending on whether bound checks are enabled
decode_utf8_unchecked(s);
-
-
fn chars(self: &[u8]) -> CharsIterator
Iterator over Unicode codepoints of a UTF-8 string.
If the string is not valid UTF-8, then the iterator will return an error when an invalid sequence is encountered.
Examples
use std::string::unicode::chars;
use std::string::unicode::Error;

let ascii = "hello"
    .chars()
    .map(Result::unwrap::<u32, Error>)
    .to_vector();
defer ascii.free();

assert_eq!(ascii[..], &[
    'h' as u32,
    'e' as u32,
    'l' as u32,
    'l' as u32,
    'o' as u32
]);

// codepoint != grapheme cluster
let face_in_cloud = "😶‍🌫️"
    .chars()
    .map(Result::unwrap::<u32, Error>)
    .to_vector();
defer face_in_cloud.free();

assert_eq!(face_in_cloud[..], &[
    0x1F636, // Face Without Mouth
    0x200D,  // Zero Width Joiner
    0x1F32B, // Fog
    0xFE0F   // Variation Selector-16
]);
When invalid UTF-8 is encountered, the iterator will return an error for each byte that is not a start of a valid sequence, but still advance. This allows for resynchronization of the stream.
use std::string::unicode::chars;
use std::string::unicode::Error;

let it = "a\xed\xa0\x80b" // pseudo-UTF-8 encoding of a surrogate
    .chars();

assert_eq!(it.next(), Option::some(Result::ok('a' as u32)));
assert_eq!(it.next(), Option::some(Result::err(Error::InvalidUTF8)));
assert_eq!(it.next(), Option::some(Result::err(Error::InvalidUTF8)));
assert_eq!(it.next(), Option::some(Result::err(Error::InvalidUTF8)));
assert_eq!(it.next(), Option::some(Result::ok('b' as u32)));
assert_eq!(it.next(), Option::none());
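The error-per-byte resynchronization behavior can be modeled in Rust; this sketch uses the standard library's UTF-8 validator and is an illustrative model, not this module's iterator:

```rust
// Model of resynchronization: on an invalid sequence, emit one error per
// byte that does not start a valid sequence, advancing one byte at a time.
fn scan(bytes: &[u8]) -> Vec<Result<u32, &'static str>> {
    let mut out = Vec::new();
    let mut i = 0;
    while i < bytes.len() {
        match std::str::from_utf8(&bytes[i..]) {
            Ok(s) => {
                // Rest of the input is valid UTF-8.
                for c in s.chars() { out.push(Ok(c as u32)); }
                break;
            }
            Err(e) => {
                let valid = e.valid_up_to();
                if valid > 0 {
                    // Emit the code points of the valid prefix.
                    let prefix = std::str::from_utf8(&bytes[i..i + valid]).unwrap();
                    for c in prefix.chars() { out.push(Ok(c as u32)); }
                    i += valid;
                } else {
                    // Invalid byte at the front: one error, advance one byte.
                    out.push(Err("InvalidUTF8"));
                    i += 1;
                }
            }
        }
    }
    out
}

fn main() {
    // Mirrors the example above: 'a', three errors, 'b'.
    let r = scan(b"a\xed\xa0\x80b");
    assert_eq!(r, vec![Ok(0x61), Err("InvalidUTF8"), Err("InvalidUTF8"),
                       Err("InvalidUTF8"), Ok(0x62)]);
}
```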
-
fn char_indices(self: &[u8]) -> CharIndicesIterator
Iterator over byte indices of Unicode codepoints of a UTF-8 string.
If the string is not valid UTF-8, then the iterator will return an error when an invalid sequence is encountered.
Example
use std::string::unicode::char_indices;
use std::string::unicode::Error;

let text = "chữ Quốc ngữ"
    .char_indices()
    .map(Result::unwrap::<(usize, u32), Error>)
    .to_vector();
defer text.free();

assert_eq!(text[..], &[
    (0, 'c' as u32),
    (1, 'h' as u32),
    (2, 0x1EEF),  // ữ
    (5, ' ' as u32),
    (6, 'Q' as u32),
    (7, 'u' as u32),
    (8, 0x1ED1),  // ố
    (11, 'c' as u32),
    (12, ' ' as u32),
    (13, 'n' as u32),
    (14, 'g' as u32),
    (15, 0x1EEF), // ữ
]);
-
fn utf8_chars<It>(iter: &mut It) -> Utf8Adapter<It>
Returns a formattable object that writes a sequence of Unicode codepoints as a UTF-8 encoded string.
It accepts an iterator over u32 values. Panics if the iterator yields a value that is not a valid Unicode scalar value.
Example
use std::string::unicode::utf8_chars;

let seq = [
    0x1f4a9u32,
    '=' as u32,
    0x1fa99,
    ' ' as u32,
    '&' as u32,
    ' ' as u32,
    0x1fa99,
    '=' as u32,
    0x1f4a9
];

// https://sl.wikisource.org/wiki/Kons._5
println!("{}", seq.iter().utf8_chars()); // prints "💩=🪙 & 💩=🪙"