#![doc(html_root_url = "https://docs.rs/charset/0.1.2")]
extern crate base64;
extern crate encoding_rs;
#[cfg(feature = "serde")]
extern crate serde;
#[cfg(all(test, feature = "serde"))]
extern crate bincode;
#[cfg(all(test, feature = "serde"))]
#[macro_use]
extern crate serde_derive;
#[cfg(all(test, feature = "serde"))]
extern crate serde_json;
use encoding_rs::CoderResult;
use encoding_rs::Encoding;
use encoding_rs::GB18030;
use encoding_rs::GBK;
use encoding_rs::UTF_16BE;
use std::borrow::Cow;
#[cfg(feature = "serde")]
use serde::de::Visitor;
#[cfg(feature = "serde")]
use serde::{Deserialize, Deserializer, Serialize, Serializer};
/// The UTF-7 charset.
///
/// This is the one `Charset` that is not backed by an `encoding_rs`
/// `Encoding`: it uses the dedicated `VariantCharset::Utf7` variant, and
/// the decode methods on `Charset` route it to this crate's own
/// `decode_utf7` function.
pub const UTF_7: Charset = Charset {
variant: VariantCharset::Utf7,
};
/// Decodes `bytes` as Latin1 by delegating to
/// `encoding_rs::mem::decode_latin1`.
///
/// The result is a `Cow` tied to the lifetime of `bytes`.
// NOTE(review): presumably borrowed for pure-ASCII input and owned otherwise;
// confirm against the `encoding_rs::mem` documentation.
pub fn decode_latin1<'a>(bytes: &'a [u8]) -> Cow<'a, str> {
encoding_rs::mem::decode_latin1(bytes)
}
/// Decodes `bytes` as ASCII, replacing every byte `>= 0x80` with U+FFFD.
///
/// When the whole input is ASCII the result borrows `bytes`; otherwise an
/// owned `String` is built in which each non-ASCII byte has become one
/// REPLACEMENT CHARACTER.
pub fn decode_ascii<'a>(bytes: &'a [u8]) -> Cow<'a, str> {
    let ascii_len = Encoding::ascii_valid_up_to(bytes);
    if ascii_len >= bytes.len() {
        debug_assert_eq!(ascii_len, bytes.len());
        // SAFETY: `ascii_valid_up_to` reported that every byte of `bytes` is
        // ASCII, and ASCII is always valid UTF-8.
        return Cow::Borrowed(unsafe { ::std::str::from_utf8_unchecked(bytes) });
    }
    let (ascii_prefix, rest) = bytes.split_at(ascii_len);
    // Worst case: every remaining byte turns into the three-byte U+FFFD.
    let mut decoded = String::with_capacity(ascii_prefix.len() + rest.len() * 3);
    // SAFETY: `ascii_prefix` is the part of the input that
    // `ascii_valid_up_to` reported as ASCII, hence valid UTF-8.
    decoded.push_str(unsafe { ::std::str::from_utf8_unchecked(ascii_prefix) });
    for &byte in rest {
        if byte < 0x80 {
            // An ASCII byte maps to the code point of the same value.
            decoded.push(byte as char);
        } else {
            decoded.push('\u{FFFD}');
        }
    }
    Cow::Owned(decoded)
}
/// A character encoding that this crate can decode from.
///
/// A `Charset` is a small `Copy` handle wrapping a `VariantCharset`, which
/// is either an `encoding_rs` `Encoding` or UTF-7.
#[derive(PartialEq, Debug, Copy, Clone, Hash)]
pub struct Charset {
// Private so that instances can only be made through `for_encoding`, the
// label/BOM lookups, or the `UTF_7` constant.
variant: VariantCharset,
}
impl Charset {
    /// Looks up the `Charset` named by `label`.
    ///
    /// `Encoding::for_label` is consulted first. If it does not recognize
    /// the label, the label is checked against `utf-7` (see
    /// `is_utf7_label`). Returns `None` when neither matches.
    #[inline]
    pub fn for_label(label: &[u8]) -> Option<Charset> {
        Encoding::for_label(label)
            .map(Charset::for_encoding)
            .or_else(|| if is_utf7_label(label) { Some(UTF_7) } else { None })
    }

    /// Same as `for_label`, except that the primary lookup goes through
    /// `Encoding::for_label_no_replacement`.
    ///
    /// The crate's tests show that a label such as `ISO-2022-KR`, which
    /// `for_label` maps to the replacement encoding, yields `None` here.
    #[inline]
    pub fn for_label_no_replacement(label: &[u8]) -> Option<Charset> {
        Encoding::for_label_no_replacement(label)
            .map(Charset::for_encoding)
            .or_else(|| if is_utf7_label(label) { Some(UTF_7) } else { None })
    }

    /// Wraps an `encoding_rs` `Encoding` in a `Charset`.
    ///
    /// GBK is folded into GB18030, so a `Charset` built here never wraps
    /// `GBK` itself.
    // NOTE(review): presumably because the two decode identically in
    // `encoding_rs`; confirm against its documentation.
    #[inline]
    pub fn for_encoding(encoding: &'static Encoding) -> Charset {
        let canonical = if encoding == GBK { GB18030 } else { encoding };
        Charset {
            variant: VariantCharset::Encoding(canonical),
        }
    }

    /// Sniffs a byte order mark at the start of `buffer`.
    ///
    /// Delegates to `Encoding::for_bom`. On a match, returns the matching
    /// `Charset` together with the length value reported by
    /// `Encoding::for_bom`; `decode` uses that value as the number of
    /// leading bytes to skip. UTF-7 is never returned from here.
    #[inline]
    pub fn for_bom(buffer: &[u8]) -> Option<(Charset, usize)> {
        Encoding::for_bom(buffer)
            .map(|(encoding, bom_len)| (Charset::for_encoding(encoding), bom_len))
    }

    /// Returns the name of this charset: `"UTF-7"` for UTF-7, otherwise
    /// whatever the wrapped `Encoding::name` returns.
    pub fn name(self) -> &'static str {
        match self.variant {
            VariantCharset::Utf7 => "UTF-7",
            VariantCharset::Encoding(encoding) => encoding.name(),
        }
    }

    /// Reports whether the charset is ASCII-compatible.
    ///
    /// UTF-7 is reported as not ASCII-compatible; for everything else the
    /// answer comes from `Encoding::is_ascii_compatible`.
    #[inline]
    pub fn is_ascii_compatible(self) -> bool {
        match self.variant {
            VariantCharset::Utf7 => false,
            VariantCharset::Encoding(encoding) => encoding.is_ascii_compatible(),
        }
    }

    /// Decodes `bytes` with BOM sniffing.
    ///
    /// If `bytes` starts with a BOM recognized by `for_bom`, the charset
    /// indicated by the BOM is used instead of `self` and the BOM bytes
    /// are skipped. Otherwise `self` decodes the whole input.
    ///
    /// Returns the decoded text, the charset that was actually used, and
    /// the error flag reported by `decode_without_bom_handling`.
    #[inline]
    pub fn decode<'a>(self, bytes: &'a [u8]) -> (Cow<'a, str>, Charset, bool) {
        let (effective, payload) = if let Some((sniffed, bom_len)) = Charset::for_bom(bytes) {
            (sniffed, &bytes[bom_len..])
        } else {
            (self, bytes)
        };
        let (text, had_errors) = effective.decode_without_bom_handling(payload);
        (text, effective, had_errors)
    }

    /// Decodes `bytes` using this charset, delegating BOM removal to
    /// `Encoding::decode_with_bom_removal` for `encoding_rs` encodings.
    ///
    /// For UTF-7 the input is handed to `decode_utf7` unchanged; no BOM
    /// stripping is performed by this method in that case.
    ///
    /// Returns the decoded text and an error flag.
    #[inline]
    pub fn decode_with_bom_removal<'a>(self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
        match self.variant {
            VariantCharset::Utf7 => decode_utf7(bytes),
            VariantCharset::Encoding(encoding) => encoding.decode_with_bom_removal(bytes),
        }
    }

    /// Decodes `bytes` using this charset without any BOM processing.
    ///
    /// Delegates to `Encoding::decode_without_bom_handling`, or to
    /// `decode_utf7` for UTF-7.
    ///
    /// Returns the decoded text and an error flag.
    #[inline]
    pub fn decode_without_bom_handling<'a>(self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
        match self.variant {
            VariantCharset::Utf7 => decode_utf7(bytes),
            VariantCharset::Encoding(encoding) => encoding.decode_without_bom_handling(bytes),
        }
    }
}
/// Converts an `encoding_rs` `Encoding` into a `Charset`; equivalent to
/// calling `Charset::for_encoding`.
impl From<&'static Encoding> for Charset {
fn from(encoding: &'static Encoding) -> Self {
Charset::for_encoding(encoding)
}
}
/// Serializes a `Charset` as a string containing its `name()`.
#[cfg(feature = "serde")]
impl Serialize for Charset {
#[inline]
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
serializer.serialize_str(self.name())
}
}
/// Serde visitor that turns a string value into a `Charset`; used by the
/// `Deserialize` implementation for `Charset`.
#[cfg(feature = "serde")]
struct CharsetVisitor;
/// Visitor implementation: accepts a string and resolves it as a charset
/// label.
#[cfg(feature = "serde")]
impl<'de> Visitor<'de> for CharsetVisitor {
    type Value = Charset;

    /// Describes the expected input for serde's error messages.
    fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
        formatter.write_str("a valid charset label")
    }

    /// Resolves `value` through `Charset::for_label`; an unknown label
    /// becomes a custom serde error that quotes the offending label.
    fn visit_str<E>(self, value: &str) -> Result<Charset, E>
    where
        E: serde::de::Error,
    {
        Charset::for_label(value.as_bytes())
            .ok_or_else(|| E::custom(format!("invalid charset label: {}", value)))
    }
}
/// Deserializes a `Charset` from a string, which `CharsetVisitor` resolves
/// as a label via `Charset::for_label`.
#[cfg(feature = "serde")]
impl<'de> Deserialize<'de> for Charset {
fn deserialize<D>(deserializer: D) -> Result<Charset, D::Error>
where
D: Deserializer<'de>,
{
deserializer.deserialize_str(CharsetVisitor)
}
}
/// Returns `true` if `label` is `utf-7`, compared ASCII-case-insensitively
/// and ignoring leading and trailing label whitespace.
///
/// Label whitespace is exactly the bytes 0x09, 0x0A, 0x0C, 0x0D and 0x20.
/// Whitespace inside the label, or any other extra byte, makes the label
/// not match.
#[inline(never)]
fn is_utf7_label(label: &[u8]) -> bool {
    let is_label_space = |byte: &u8| match *byte {
        0x09 | 0x0A | 0x0C | 0x0D | 0x20 => true,
        _ => false,
    };
    // Index of the first byte that is not whitespace; an empty or
    // all-whitespace label cannot be a match.
    let start = match label.iter().position(|byte| !is_label_space(byte)) {
        Some(index) => index,
        None => return false,
    };
    // One past the last byte that is not whitespace. A non-whitespace byte
    // is known to exist at this point, so the fallback is never taken.
    let end = label
        .iter()
        .rposition(|byte| !is_label_space(byte))
        .map_or(start, |index| index + 1);
    label[start..end].eq_ignore_ascii_case(b"utf-7")
}
/// Returns the length of the leading run of `bytes` that can be copied to
/// the output verbatim: the index of the first `+` or the first byte
/// `>= 0x80`, or `bytes.len()` if there is no such byte.
#[inline]
fn utf7_ascii_up_to(bytes: &[u8]) -> usize {
    bytes
        .iter()
        .position(|&byte| byte == b'+' || byte >= 0x80)
        .unwrap_or(bytes.len())
}
/// Returns the length of the leading run of `bytes` that consists only of
/// base64 alphabet characters (`A`-`Z`, `a`-`z`, `0`-`9`, `+`, `/`).
///
/// The result is the index of the first byte outside that alphabet, or
/// `bytes.len()` if every byte belongs to it. The padding character `=` is
/// not part of the accepted alphabet.
#[inline]
fn utf7_base64_up_to(bytes: &[u8]) -> usize {
    // `is_ascii_alphanumeric` replaces the deprecated `a...b` range
    // patterns, which newer Rust editions reject outright.
    bytes
        .iter()
        .position(|&byte| !(byte.is_ascii_alphanumeric() || byte == b'+' || byte == b'/'))
        .unwrap_or(bytes.len())
}
/// Decodes one base64 run of a UTF-7 shifted sequence and appends the
/// resulting text to `string`.
///
/// `bytes` is the base64 run only (the caller has already removed the
/// leading `+` and any terminator). The run is base64-decoded in chunks and
/// the decoded bytes are streamed through a UTF-16BE decoder.
///
/// Returns `true` if any error was encountered: either the UTF-16BE decoder
/// reported one, or trailing base64 input had to be dropped to make the
/// final chunk decodable.
///
/// # Panics
///
/// Panics if a chunk other than the final one fails to base64-decode
/// (`assert!(last)`), or if `max_utf8_buffer_length` returns `None`.
#[inline]
fn utf7_base64_decode(bytes: &[u8], string: &mut String) -> bool {
let mut decoder = UTF_16BE.new_decoder_without_bom_handling();
// 80 base64 characters decode to exactly 60 bytes, so one full chunk
// always fits in `buf`.
let mut buf = [0u8; 60];
let mut tail = bytes;
let mut had_errors = false;
// Set when trailing base64 input had to be discarded; causes one U+FFFD to
// be appended after everything else has been decoded.
let mut trailing_error = false;
loop {
// Take up to 80 input bytes; `last` marks the final chunk.
let (last, mut cap) = if tail.len() <= 80 {
(true, tail.len())
} else {
(false, 80)
};
let len;
loop {
match base64::decode_config_slice(&tail[..cap], base64::STANDARD_NO_PAD, &mut buf[..]) {
Ok(l) => {
len = l;
break;
}
Err(_) => {
// Only the final chunk is expected to be able to fail. Drop one
// trailing input byte and retry with the shorter chunk.
assert!(last);
had_errors = true;
trailing_error = true;
tail = &tail[..tail.len() - 1];
cap -= 1;
}
}
}
// Feed the decoded bytes of this chunk to the UTF-16BE decoder, growing
// `string` whenever the decoder reports that the output is full.
let mut total_read = 0;
loop {
let (result, read, err) = decoder.decode_to_string(&buf[total_read..len], string, last);
total_read += read;
had_errors |= err;
match result {
CoderResult::InputEmpty => {
if last {
if trailing_error {
string.push_str("\u{FFFD}");
}
return had_errors;
}
break;
}
CoderResult::OutputFull => {
// Reserve room for the worst-case expansion of what is left of
// this chunk, then call the decoder again.
let left = len - total_read;
let needed = decoder.max_utf8_buffer_length(left).unwrap();
string.reserve(needed);
}
}
}
// Reached only for non-final chunks, where `tail.len() > 80`.
tail = &tail[80..];
}
}
/// Decodes `bytes` as UTF-7.
///
/// Returns the decoded text and a flag that is `true` if any malformed
/// input was encountered. Each malformed sequence puts U+FFFD into the
/// output. If the input contains neither `+` nor any byte `>= 0x80`, the
/// result borrows `bytes` and no allocation takes place.
#[inline(never)]
fn decode_utf7<'a>(bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
let up_to = utf7_ascii_up_to(bytes);
if up_to == bytes.len() {
// SAFETY: `utf7_ascii_up_to` found no byte >= 0x80, so the input is
// ASCII and therefore valid UTF-8.
let s: &str = unsafe { std::str::from_utf8_unchecked(bytes) };
return (Cow::Borrowed(s), false);
}
let mut had_errors = false;
let mut out = String::with_capacity(bytes.len());
// SAFETY: the prefix up to `up_to` contains only bytes < 0x80.
out.push_str(unsafe { std::str::from_utf8_unchecked(&bytes[..up_to]) });
let mut tail = &bytes[up_to..];
// At the top of each iteration `tail` is non-empty and starts with either
// `+` or a byte >= 0x80, because that is where `utf7_ascii_up_to` stops.
loop {
let first = tail[0];
tail = &tail[1..];
if first == b'+' {
// Start of a shifted sequence: decode the base64 run after the `+`.
let up_to = utf7_base64_up_to(tail);
had_errors |= utf7_base64_decode(&tail[..up_to], &mut out);
if up_to == tail.len() {
// The base64 run extends to the end of the input.
if up_to == 0 {
// A lone `+` at the very end of the input is an error.
had_errors = true;
out.push_str("\u{FFFD}");
}
return (Cow::Owned(out), had_errors);
}
if up_to == 0 {
if tail[up_to] == b'-' {
// `+-` stands for a literal `+`.
out.push_str("+");
tail = &tail[up_to + 1..];
} else {
// `+` followed by a byte that is neither base64 nor `-`: error. The
// following byte stays in `tail` and is handled below.
had_errors = true;
out.push_str("\u{FFFD}");
}
} else if tail[up_to] == b'-' {
// A `-` directly after the base64 run terminates it and is consumed.
tail = &tail[up_to + 1..];
} else {
// Any other terminator ends the run but remains part of the text.
tail = &tail[up_to..];
}
} else {
// `first` is a byte >= 0x80, which is not valid here.
had_errors = true;
out.push_str("\u{FFFD}");
}
// Copy the following run of bytes that need no special handling.
let up_to = utf7_ascii_up_to(tail);
// SAFETY: `utf7_ascii_up_to` guarantees that `tail[..up_to]` holds only
// bytes < 0x80.
out.push_str(unsafe { std::str::from_utf8_unchecked(&tail[..up_to]) });
if up_to == tail.len() {
return (Cow::Owned(out), had_errors);
}
tail = &tail[up_to..];
}
}
/// Internal representation behind `Charset`.
#[derive(PartialEq, Debug, Copy, Clone, Hash)]
enum VariantCharset {
// UTF-7, decoded by this crate's `decode_utf7`.
Utf7,
// Any encoding provided by `encoding_rs`.
Encoding(&'static Encoding),
}
/// Test-only struct used by the serde round-trip tests; it embeds a
/// `Charset` next to ordinary fields.
#[cfg(all(test, feature = "serde"))]
#[derive(Serialize, Deserialize, Debug, PartialEq)]
struct Demo {
num: u32,
name: String,
charset: Charset,
}
#[cfg(test)]
mod tests {
use super::*;
fn utf7_no_err(bytes: &[u8]) -> String {
let (cow, had_errors) = UTF_7.decode_without_bom_handling(bytes);
assert!(!had_errors);
cow.into()
}
fn utf7_err(bytes: &[u8]) -> String {
let (cow, had_errors) = UTF_7.decode_without_bom_handling(bytes);
assert!(had_errors);
cow.into()
}
#[test]
fn test_for_label() {
assert_eq!(Charset::for_label(b" uTf-7\t "), Some(UTF_7));
assert_eq!(
Charset::for_label(b" uTf-8\t "),
Some(Charset::for_encoding(encoding_rs::UTF_8))
);
assert_eq!(
Charset::for_label(b" iSo-8859-1\t "),
Some(Charset::for_encoding(encoding_rs::WINDOWS_1252))
);
assert_eq!(
Charset::for_label(b" gb2312\t "),
Some(Charset::for_encoding(encoding_rs::GB18030))
);
assert_eq!(
Charset::for_label(b" ISO-2022-KR\t "),
Some(Charset::for_encoding(encoding_rs::REPLACEMENT))
);
assert_eq!(Charset::for_label(b"u"), None);
assert_eq!(Charset::for_label(b"ut"), None);
assert_eq!(Charset::for_label(b"utf"), None);
assert_eq!(Charset::for_label(b"utf-"), None);
}
#[test]
fn test_for_label_no_replacement() {
assert_eq!(
Charset::for_label_no_replacement(b" uTf-7\t "),
Some(UTF_7)
);
assert_eq!(
Charset::for_label_no_replacement(b" uTf-8\t "),
Some(Charset::for_encoding(encoding_rs::UTF_8))
);
assert_eq!(
Charset::for_label_no_replacement(b" iSo-8859-1\t "),
Some(Charset::for_encoding(encoding_rs::WINDOWS_1252))
);
assert_eq!(
Charset::for_label_no_replacement(b" Gb2312\t "),
Some(Charset::for_encoding(encoding_rs::GB18030))
);
assert_eq!(Charset::for_label_no_replacement(b" ISO-2022-KR\t "), None);
assert_eq!(Charset::for_label_no_replacement(b"u"), None);
assert_eq!(Charset::for_label_no_replacement(b"ut"), None);
assert_eq!(Charset::for_label_no_replacement(b"utf"), None);
assert_eq!(Charset::for_label_no_replacement(b"utf-"), None);
}
#[test]
fn test_for_label_and_name() {
assert_eq!(Charset::for_label(b" uTf-7\t ").unwrap().name(), "UTF-7");
assert_eq!(Charset::for_label(b" uTf-8\t ").unwrap().name(), "UTF-8");
assert_eq!(
Charset::for_label(b" Gb2312\t ").unwrap().name(),
"gb18030"
);
}
#[test]
fn test_utf7_decode() {
assert_eq!(utf7_no_err(b""), "");
assert_eq!(utf7_no_err(b"ab"), "ab");
assert_eq!(utf7_no_err(b"+-"), "+");
assert_eq!(utf7_no_err(b"a+-b"), "a+b");
assert_eq!(utf7_no_err(b"+ACs-"), "+");
assert_eq!(utf7_no_err(b"+AGEAKwBi-"), "a+b");
assert_eq!(utf7_no_err(b"+JgM-"), "\u{2603}");
assert_eq!(utf7_no_err(b"+JgM."), "\u{2603}.");
assert_eq!(utf7_no_err(b"+JgM "), "\u{2603} ");
assert_eq!(utf7_no_err(b"+JgM--"), "\u{2603}-");
assert_eq!(utf7_no_err(b"+JgM"), "\u{2603}");
assert_eq!(utf7_no_err(b"+JgMmAw-"), "\u{2603}\u{2603}");
assert_eq!(utf7_no_err(b"+JgMmAw."), "\u{2603}\u{2603}.");
assert_eq!(utf7_no_err(b"+JgMmAw "), "\u{2603}\u{2603} ");
assert_eq!(utf7_no_err(b"+JgMmAw--"), "\u{2603}\u{2603}-");
assert_eq!(utf7_no_err(b"+JgMmAw"), "\u{2603}\u{2603}");
assert_eq!(utf7_no_err(b"+2D3cqQ-"), "\u{1F4A9}");
assert_eq!(utf7_no_err(b"+2D3cqQ."), "\u{1F4A9}.");
assert_eq!(utf7_no_err(b"+2D3cqQ "), "\u{1F4A9} ");
assert_eq!(utf7_no_err(b"+2D3cqQ--"), "\u{1F4A9}-");
assert_eq!(utf7_no_err(b"+2D3cqQ"), "\u{1F4A9}");
assert_eq!(utf7_no_err(b"+JgPYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp"), "\u{2603}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}");
assert_eq!(utf7_err(b"+"), "\u{FFFD}");
assert_eq!(utf7_err(b"+J-"), "\u{FFFD}");
assert_eq!(utf7_err(b"+Jg-"), "\u{FFFD}");
assert_eq!(utf7_err(b"+J"), "\u{FFFD}");
assert_eq!(utf7_err(b"+Jg"), "\u{FFFD}");
assert_eq!(utf7_err(b"+."), "\u{FFFD}.");
assert_eq!(utf7_err(b"+J."), "\u{FFFD}.");
assert_eq!(utf7_err(b"+Jg."), "\u{FFFD}.");
assert_eq!(utf7_err(b"+ "), "\u{FFFD} ");
assert_eq!(utf7_err(b"+J "), "\u{FFFD} ");
assert_eq!(utf7_err(b"+Jg "), "\u{FFFD} ");
assert_eq!(utf7_err(b"+JgMmA-"), "\u{2603}\u{FFFD}\u{FFFD}");
assert_eq!(utf7_err(b"+JgMmA"), "\u{2603}\u{FFFD}\u{FFFD}");
assert_eq!(utf7_err(b"+JgMmA."), "\u{2603}\u{FFFD}\u{FFFD}.");
assert_eq!(utf7_err(b"+JgMmA "), "\u{2603}\u{FFFD}\u{FFFD} ");
assert_eq!(utf7_err(b"+JgMm-"), "\u{2603}\u{FFFD}");
assert_eq!(utf7_err(b"+JgMm"), "\u{2603}\u{FFFD}");
assert_eq!(utf7_err(b"+JgMm."), "\u{2603}\u{FFFD}.");
assert_eq!(utf7_err(b"+JgMm "), "\u{2603}\u{FFFD} ");
assert_eq!(utf7_err(b"+2D3cq-"), "\u{FFFD}\u{FFFD}");
assert_eq!(utf7_err(b"+2D3cq"), "\u{FFFD}\u{FFFD}");
assert_eq!(utf7_err(b"+2D3cq."), "\u{FFFD}\u{FFFD}.");
assert_eq!(utf7_err(b"+2D3cq "), "\u{FFFD}\u{FFFD} ");
assert_eq!(utf7_err(b"+2D3c-"), "\u{FFFD}");
assert_eq!(utf7_err(b"+2D3c"), "\u{FFFD}");
assert_eq!(utf7_err(b"+2D3c."), "\u{FFFD}.");
assert_eq!(utf7_err(b"+2D3c "), "\u{FFFD} ");
assert_eq!(utf7_err(b"+2D3-"), "\u{FFFD}");
assert_eq!(utf7_err(b"+2D3"), "\u{FFFD}");
assert_eq!(utf7_err(b"+2D3."), "\u{FFFD}.");
assert_eq!(utf7_err(b"+2D3 "), "\u{FFFD} ");
assert_eq!(utf7_err(b"+2D-"), "\u{FFFD}");
assert_eq!(utf7_err(b"+2D"), "\u{FFFD}");
assert_eq!(utf7_err(b"+2D."), "\u{FFFD}.");
assert_eq!(utf7_err(b"+2D "), "\u{FFFD} ");
assert_eq!(utf7_err(b"+2-"), "\u{FFFD}");
assert_eq!(utf7_err(b"+2"), "\u{FFFD}");
assert_eq!(utf7_err(b"+2."), "\u{FFFD}.");
assert_eq!(utf7_err(b"+2 "), "\u{FFFD} ");
assert_eq!(utf7_err(b"+2D0-"), "\u{FFFD}");
assert_eq!(utf7_err(b"+2D0"), "\u{FFFD}");
assert_eq!(utf7_err(b"+2D0."), "\u{FFFD}.");
assert_eq!(utf7_err(b"+2D0 "), "\u{FFFD} ");
assert_eq!(utf7_err(b"+2D0AYQ-"), "\u{FFFD}a");
assert_eq!(utf7_err(b"+2D0AYQ"), "\u{FFFD}a");
assert_eq!(utf7_err(b"+2D0AYQ."), "\u{FFFD}a.");
assert_eq!(utf7_err(b"+2D0AYQ "), "\u{FFFD}a ");
assert_eq!(utf7_err(b"+2D3/QQ-"), "\u{FFFD}\u{FF41}");
assert_eq!(utf7_err(b"+2D3/QQ"), "\u{FFFD}\u{FF41}");
assert_eq!(utf7_err(b"+2D3/QQ."), "\u{FFFD}\u{FF41}.");
assert_eq!(utf7_err(b"+2D3/QQ "), "\u{FFFD}\u{FF41} ");
assert_eq!(utf7_err(b"+AGHcqQ-"), "a\u{FFFD}");
assert_eq!(utf7_err(b"+AGHcqQ"), "a\u{FFFD}");
assert_eq!(utf7_err(b"+AGHcqQ."), "a\u{FFFD}.");
assert_eq!(utf7_err(b"+AGHcqQ "), "a\u{FFFD} ");
}
#[test]
fn test_decode_ascii() {
assert_eq!(decode_ascii(b"aa\x80bb\xFFcc"), "aa\u{FFFD}bb\u{FFFD}cc");
}
#[test]
fn test_from() {
let _: Charset = encoding_rs::UTF_8.into();
}
#[cfg(feature = "serde")]
#[test]
fn test_serde_utf7() {
let demo = Demo {
num: 42,
name: "foo".into(),
charset: UTF_7,
};
let serialized = serde_json::to_string(&demo).unwrap();
let deserialized: Demo = serde_json::from_str(&serialized).unwrap();
assert_eq!(deserialized, demo);
let bincoded = bincode::serialize(&demo, bincode::Infinite).unwrap();
let debincoded: Demo = bincode::deserialize(&bincoded[..]).unwrap();
assert_eq!(debincoded, demo);
}
#[cfg(feature = "serde")]
#[test]
fn test_serde_utf8() {
let demo = Demo {
num: 42,
name: "foo".into(),
charset: encoding_rs::UTF_8.into(),
};
let serialized = serde_json::to_string(&demo).unwrap();
let deserialized: Demo = serde_json::from_str(&serialized).unwrap();
assert_eq!(deserialized, demo);
let bincoded = bincode::serialize(&demo, bincode::Infinite).unwrap();
let debincoded: Demo = bincode::deserialize(&bincoded[..]).unwrap();
assert_eq!(debincoded, demo);
}
}