我是靠谱客的博主 儒雅哈密瓜,这篇文章主要介绍js实现unicode码字符串与utf8字节数据互转详解,现在分享给大家,希望可以做个参考。

js的string变量在内存中以utf-16编码单元存储unicode字符串,保存或传输时必须先转换成某种字节编码,比如utf-8、utf-32等。存储到数据库中为utf-8编码,读取出来后如何把这些字节还原成正确的字符串就成了问题。现在给出解决方案,可以正确支持中文、emoji表情、英文混合的字符串编码互转。

复制代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
/**
 * Created by hdwang on 2019/1/28.
 *
 * Conversion between JavaScript strings (UTF-16 code units) and UTF-8 bytes.
 *
 * Exposes:
 *   convertUtf8.toBytes(text)    -> Uint8Array holding the UTF-8 encoding
 *   convertUtf8.fromBytes(bytes) -> string decoded from UTF-8 bytes
 */
// Declared with `var` so the binding remains a property of the global object
// when this file is loaded as a classic script.
var convertUtf8 = (function () {
  'use strict';

  // U+FFFD REPLACEMENT CHARACTER: substituted for anything that cannot be
  // encoded (unpaired surrogates) or decoded (malformed byte sequences).
  const REPLACEMENT_CODE_POINT = 0xfffd;
  const REPLACEMENT_CHAR = '\uFFFD';

  /**
   * Encodes a string as UTF-8.
   *
   * Unpaired surrogates cannot be represented in UTF-8; each one is encoded
   * as U+FFFD (EF BF BD) instead of raising an error.
   *
   * @param {string} text - Value to encode; non-strings are converted to a string first.
   * @returns {Uint8Array} The UTF-8 bytes of `text`.
   */
  function toBytes(text) {
    // Template literal applies the plain ToString conversion to the argument.
    const str = `${text}`;
    const bytes = [];
    let index = 0;

    while (index < str.length) {
      // codePointAt joins a valid surrogate pair into one code point and
      // returns the bare code unit for an unpaired surrogate.
      let codePoint = str.codePointAt(index);
      index += codePoint > 0xffff ? 2 : 1;

      if (codePoint >= 0xd800 && codePoint <= 0xdfff) {
        codePoint = REPLACEMENT_CODE_POINT;
      }

      if (codePoint < 0x80) {
        bytes.push(codePoint);
      } else if (codePoint < 0x800) {
        bytes.push(0xc0 | (codePoint >> 6), 0x80 | (codePoint & 0x3f));
      } else if (codePoint < 0x10000) {
        bytes.push(
          0xe0 | (codePoint >> 12),
          0x80 | ((codePoint >> 6) & 0x3f),
          0x80 | (codePoint & 0x3f)
        );
      } else {
        bytes.push(
          0xf0 | (codePoint >> 18),
          0x80 | ((codePoint >> 12) & 0x3f),
          0x80 | ((codePoint >> 6) & 0x3f),
          0x80 | (codePoint & 0x3f)
        );
      }
    }

    return new Uint8Array(bytes);
  }

  /**
   * Decodes UTF-8 bytes into a string.
   *
   * Well-formed input decodes to the characters it encodes. Malformed input
   * (invalid lead byte, stray continuation byte, truncated sequence, overlong
   * form, encoded surrogate, value above U+10FFFF) never throws: each
   * offending sequence is replaced by U+FFFD and decoding resumes at the
   * first byte that was not consumed.
   *
   * @param {ArrayLike<number>} utf8Bytes - Array or typed array of byte values.
   *   Only the low 8 bits of each element are used, so signed byte arrays work too.
   * @returns {string} The decoded string.
   */
  function utf8ByteToUnicodeStr(utf8Bytes) {
    const length = utf8Bytes.length;
    let decoded = '';
    let pos = 0;

    while (pos < length) {
      const lead = utf8Bytes[pos] & 0xff;
      pos += 1;

      // 0xxxxxxx: single-byte (ASCII) character.
      if (lead < 0x80) {
        decoded += String.fromCharCode(lead);
        continue;
      }

      let codePoint;
      let remaining;
      // Allowed range for the NEXT continuation byte. It is narrowed for the
      // second byte of some sequences so that overlong encodings, surrogates
      // and values above U+10FFFF are rejected.
      let lower = 0x80;
      let upper = 0xbf;

      if (lead >= 0xc2 && lead <= 0xdf) {
        // 110xxxxx: 2-byte sequence (0xC0/0xC1 would be overlong).
        remaining = 1;
        codePoint = lead & 0x1f;
      } else if (lead >= 0xe0 && lead <= 0xef) {
        // 1110xxxx: 3-byte sequence.
        remaining = 2;
        codePoint = lead & 0x0f;
        if (lead === 0xe0) {
          lower = 0xa0; // below this the value fits in 2 bytes (overlong)
        } else if (lead === 0xed) {
          upper = 0x9f; // above this the value is a surrogate (U+D800..U+DFFF)
        }
      } else if (lead >= 0xf0 && lead <= 0xf4) {
        // 11110xxx: 4-byte sequence.
        remaining = 3;
        codePoint = lead & 0x07;
        if (lead === 0xf0) {
          lower = 0x90; // below this the value fits in 3 bytes (overlong)
        } else if (lead === 0xf4) {
          upper = 0x8f; // above this the value exceeds U+10FFFF
        }
      } else {
        // Stray continuation byte, or a lead byte that is never valid
        // (0xC0, 0xC1, 0xF5..0xFF, including the obsolete 5/6-byte forms).
        decoded += REPLACEMENT_CHAR;
        continue;
      }

      let isWellFormed = true;
      while (remaining > 0) {
        // -1 marks "ran off the end of the input" and always fails the range check.
        const next = pos < length ? utf8Bytes[pos] & 0xff : -1;
        if (next < lower || next > upper) {
          // Leave the offending byte unconsumed so it is decoded on its own.
          isWellFormed = false;
          break;
        }
        codePoint = (codePoint << 6) | (next & 0x3f);
        pos += 1;
        remaining -= 1;
        lower = 0x80;
        upper = 0xbf;
      }

      decoded += isWellFormed ? String.fromCodePoint(codePoint) : REPLACEMENT_CHAR;
    }

    return decoded;
  }

  return {
    toBytes: toBytes,
    fromBytes: utf8ByteToUnicodeStr,
  };
})();

emoji这类在utf-8中占4个字节的字符,在js字符串中占两个utf-16编码单元(代理对)。使用String.fromCharCode也可以实现,需要进行两次fromCharCode,没有String.fromCodePoint方便。下面展示了utf-8的4字节转换为unicode(utf-16)的过程。

复制代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
// Fragment: converts the 4-byte UTF-8 sequence starting at utf8Bytes[pos]
// into a UTF-16 surrogate pair appended to unicodeStr.
// Relies on `utf8Bytes`, `pos`, `unicodeStr` and `unicode` from the enclosing code.

// High surrogate, 10 bits: 3 payload bits of byte 0, 6 of byte 1 and the top
// 2 payload bits of byte 2 together form (code point >> 10).
// The lead byte carries 3 payload bits, so it is masked with 0x07; masking
// with 0x03 drops the top bit and breaks code points U+100000..U+10FFFF.
unicode = ((utf8Bytes[pos] & 0x07) << 8) | ((utf8Bytes[pos + 1] & 0x3f) << 2) | ((utf8Bytes[pos + 2] >> 4) & 0x03);
// UTF-16 stores (code point - 0x10000). The low 10 bits belong to the low
// surrogate, so here that is a subtraction of 0x10000 >> 10 = 0x40 (binary 1000000).
unicode = unicode - 0x40;
// Add the UTF-16 high-surrogate marker.
unicode = 0xD800 + unicode;
console.log(unicode);
unicodeStr += String.fromCharCode(unicode);

// Low surrogate, 10 bits: low 4 payload bits of byte 2 and 6 of byte 3.
unicode = ((utf8Bytes[pos + 2] & 0x0F) << 6) | (utf8Bytes[pos + 3] & 0x3F);
// Add the UTF-16 low-surrogate marker.
unicode = 0xDC00 + unicode;
console.log(unicode);
unicodeStr += String.fromCharCode(unicode);
pos += 4;

以上所述是小编给大家介绍的js实现unicode码字符串与utf8字节数据互转详解整合,希望对大家有所帮助,如果大家有任何疑问请给我留言,小编会及时回复大家的。在此也非常感谢大家对靠谱客网站的支持!

最后

以上就是儒雅哈密瓜最近收集整理的关于js实现unicode码字符串与utf8字节数据互转详解的全部内容,更多相关js实现unicode码字符串与utf8字节数据互转详解内容请搜索靠谱客的其他文章。

本图文内容来源于网友提供,作为学习参考使用,或来自网络收集整理,版权属于原作者所有。
点赞(133)

评论列表共有 0 条评论

立即
投稿
返回
顶部