@@ -83,12 +83,10 @@ impl From<Script> for ScriptExtension {
8383impl TryFrom < ScriptExtension > for Script {
8484 type Error = ( ) ;
8585 fn try_from ( ext : ScriptExtension ) -> Result < Self , ( ) > {
86- if ext. is_common_or_inherited ( ) {
87- if ext. common {
88- Ok ( Script :: Common )
89- } else {
90- Ok ( Script :: Inherited )
91- }
86+ if ext. is_common ( ) {
87+ Ok ( Script :: Common )
88+ } else if ext. is_inherited ( ) {
89+ Ok ( Script :: Inherited )
9290 } else if ext. is_empty ( ) {
9391 Ok ( Script :: Unknown )
9492 } else {
@@ -131,94 +129,88 @@ impl fmt::Display for Script {
131129 }
132130}
133131
134- #[ derive( Clone , Copy , PartialEq , Eq , Hash ) ]
132+ #[ derive( Clone , Copy , PartialEq , Eq , Hash , Debug ) ]
135133#[ non_exhaustive]
136134/// A value for the `Script_Extension` property
137135///
138136/// [`ScriptExtension`] is one or more [`Script`]
139137///
140138/// This is essentially an optimized version of `Vec<Script>` that uses bitfields
141139pub struct ScriptExtension {
142- // A bitset for the first 64 scripts
140+ // A bitset for the scripts [0..64]
143141 first : u64 ,
144- // A bitset for the scripts 65- 128
142+ // A bitset for the scripts [64..128]
145143 second : u64 ,
146- // A bitset for scripts after 128
144+ // A bitset for the scripts [128..NEXT_SCRIPT]
145+ // The top 2 bits record whether Common and Inherited are included
146+ // * Bit 62 (`1 << 62`, the 63rd bit) indicates whether it includes Common
147+ // * Bit 63 (`1 << 63`, the 64th bit) indicates whether it includes Inherited
147148 third : u64 ,
148- // Both Common and Inherited are represented by all used bits being set,
149- // this flag lets us distinguish the two.
150- common : bool ,
151149}
152150
153151impl ScriptExtension {
154152 // We don't use the complete u64 of `third`: its two highest bits (62 and 63) are
155153 // reserved as the Common and Inherited flags, so script bits must stay below them.
156154 // The check below takes the number of the next (unused) script bit, subtracts 128 to
157155 // bring it in the range of `third`, and asserts in a const item that it is below 63.
158- const THIRD_MAX : u64 = ( ( 1 << ( NEXT_SCRIPT - 128 ) ) - 1 ) ;
156+ const _CHECK: ( ) = assert ! ( NEXT_SCRIPT - 128 < 63 ) ;
157+ const COMMON_MASK : u64 = ( 1 << 62 ) ; // 63rd bit
158+ const INHERITED_MASK : u64 = ( 1 << 63 ) ; // 64th bit
159159
160160 pub ( crate ) const fn new ( first : u64 , second : u64 , third : u64 ) -> Self {
161161 ScriptExtension {
162162 first,
163163 second,
164164 third,
165- common : false ,
166165 }
167166 }
168167
168+ /// Returns a ScriptExtension containing only Common.
169169 pub ( crate ) const fn new_common ( ) -> Self {
170170 ScriptExtension {
171- first : u64:: MAX ,
172- second : u64:: MAX ,
173- third : Self :: THIRD_MAX ,
174- common : true ,
171+ first : 0 ,
172+ second : 0 ,
173+ third : Self :: COMMON_MASK ,
175174 }
176175 }
177176
177+ /// Returns a ScriptExtension containing only Inherited.
178178 pub ( crate ) const fn new_inherited ( ) -> Self {
179179 ScriptExtension {
180- first : u64:: MAX ,
181- second : u64:: MAX ,
182- third : Self :: THIRD_MAX ,
183- common : false ,
180+ first : 0 ,
181+ second : 0 ,
182+ third : Self :: INHERITED_MASK ,
184183 }
185184 }
186185
186+ /// Returns an empty ScriptExtension
187187 pub ( crate ) const fn new_unknown ( ) -> Self {
188188 ScriptExtension {
189189 first : 0 ,
190190 second : 0 ,
191191 third : 0 ,
192- common : false ,
193192 }
194193 }
195194
196- const fn is_common_or_inherited ( self ) -> bool {
197- ( self . first == u64:: MAX ) & ( self . second == u64:: MAX ) & ( self . third == Self :: THIRD_MAX )
198- }
199-
200195 /// Checks if the script extension contains Common
201196 pub const fn is_common ( self ) -> bool {
202- self . is_common_or_inherited ( ) & self . common
197+ ( self . third & Self :: COMMON_MASK ) != 0
203198 }
204199
205200 /// Checks if the script extension contains Inherited
206201 pub const fn is_inherited ( self ) -> bool {
207- self . is_common_or_inherited ( ) & ! self . common
202+ ( self . third & Self :: INHERITED_MASK ) != 0
208203 }
209204
210205 /// Checks if the script extension is empty (unknown)
211206 pub const fn is_empty ( self ) -> bool {
212207 ( self . first == 0 ) & ( self . second == 0 ) & ( self . third == 0 )
213208 }
214209
215- /// Returns the number of scripts in the script extension
210+ /// Returns the number of scripts in the script extension. Common and
211+ /// Inherited, if present, are included and counted independently in the return value.
216212 pub fn len ( self ) -> usize {
217- if self . is_common_or_inherited ( ) {
218- 1
219- } else {
220- ( self . first . count_ones ( ) + self . second . count_ones ( ) + self . third . count_ones ( ) ) as usize
221- }
213+ ( self . first . count_ones ( ) + self . second . count_ones ( ) + self . third . count_ones ( ) ) as usize
222214 }
223215
224216 /// Intersect this `ScriptExtension` with another `ScriptExtension`. Produces `Unknown` if things
@@ -233,54 +225,47 @@ impl ScriptExtension {
233225
234226 /// Find the intersection between two ScriptExtensions. Returns Unknown if things
235227 /// do not intersect.
236- ///
237- /// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting
238- /// everything, the intersection of `Common` and `Inherited` is `Inherited`
239228 pub const fn intersection ( self , other : Self ) -> Self {
240229 let first = self . first & other. first ;
241230 let second = self . second & other. second ;
242231 let third = self . third & other. third ;
243- let common = self . common & other. common ;
244232 ScriptExtension {
245233 first,
246234 second,
247235 third,
248- common,
249236 }
250237 }
251238
252239 /// Find the union between two ScriptExtensions.
253- ///
254- /// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting
255- /// everything, the union of `Common` and `Inherited` is `Common`
256240 pub const fn union ( self , other : Self ) -> Self {
257241 let first = self . first | other. first ;
258242 let second = self . second | other. second ;
259243 let third = self . third | other. third ;
260- let common = self . common | other. common ;
261244 ScriptExtension {
262245 first,
263246 second,
264247 third,
265- common,
266248 }
267249 }
268250
251+ /// Returns true if and only if all members of `self` are present in `other`.
252+ pub fn is_subset_or_equal ( self , other : Self ) -> bool {
253+ self . intersection ( other) == self && self . union ( other) == other
254+ }
255+
269256 /// Check if this ScriptExtension contains the given script
270- ///
271- /// Should be used with specific scripts only, this will
272- /// return `true` if `self` is not `Unknown` and `script` is
273- /// `Common` or `Inherited`
274257 pub fn contains_script ( self , script : Script ) -> bool {
275258 !self . intersection ( script. into ( ) ) . is_empty ( )
276259 }
277260
278- /// Get the intersection of script extensions of all characters
279- /// in a string.
261+ /// Get the script extension representing the union of all scripts for
262+ /// the characters in a string.
263+ ///
264+ /// The result starts out empty (Unknown) and only grows as each character's extension is merged in.
280265 pub fn for_str ( x : & str ) -> Self {
281- let mut ext = ScriptExtension :: default ( ) ;
266+ let mut ext = ScriptExtension :: new_unknown ( ) ;
282267 for ch in x. chars ( ) {
283- ext. intersect_with ( ch. into ( ) ) ;
268+ ext = ext . union ( ch. into ( ) ) ;
284269 }
285270 ext
286271 }
@@ -311,33 +296,23 @@ impl From<&'_ str> for ScriptExtension {
311296 }
312297}
313298
314- impl fmt:: Debug for ScriptExtension {
315- fn fmt ( & self , f : & mut fmt:: Formatter ) -> fmt:: Result {
316- write ! ( f, "ScriptExtension(" ) ?;
317- fmt:: Display :: fmt ( self , f) ?;
318- write ! ( f, ")" )
319- }
320- }
321-
322299impl fmt:: Display for ScriptExtension {
323300 fn fmt ( & self , f : & mut fmt:: Formatter ) -> fmt:: Result {
324- if self . is_common ( ) {
325- write ! ( f, "Common" ) ?;
326- } else if self . is_inherited ( ) {
327- write ! ( f, "Inherited" ) ?;
328- } else if self . is_empty ( ) {
301+ write ! ( f, "ScriptExtension(" ) ?;
302+ if self . is_empty ( ) {
329303 write ! ( f, "Unknown" ) ?;
330304 } else {
331305 let mut first = true ;
332306 for script in self . iter ( ) {
333- if !first {
334- write ! ( f, " + " ) ?;
307+ if first {
335308 first = false ;
309+ } else {
310+ write ! ( f, " + " ) ?;
336311 }
337312 script. full_name ( ) . fmt ( f) ?;
338313 }
339314 }
340- Ok ( ( ) )
315+ write ! ( f , ")" )
341316 }
342317}
343318
@@ -361,7 +336,7 @@ impl UnicodeScript for char {
361336
362337/// Iterator over scripts in a [ScriptExtension].
363338///
364- /// Can be obtained ia [ScriptExtension::iter()]
339+ /// Can be obtained via [ScriptExtension::iter()]
365340pub struct ScriptIterator {
366341 ext : ScriptExtension ,
367342}
@@ -370,26 +345,31 @@ impl Iterator for ScriptIterator {
370345 type Item = Script ;
371346
372347 fn next ( & mut self ) -> Option < Script > {
373- if self . ext . is_common_or_inherited ( ) {
374- let common = self . ext . common ;
375- self . ext = ScriptExtension :: new_unknown ( ) ;
376- if common {
377- Some ( Script :: Common )
378- } else {
379- Some ( Script :: Inherited )
380- }
348+ if self . ext . is_inherited ( ) {
349+ // If `self.ext` is both Inherited and Common, this
350+ // temporarily constructs an invalid ScriptExtension. We don't
351+ // use `self.ext` for anything other than iterating over bits,
352+ // so this is okay.
353+ self . ext . third &= !ScriptExtension :: INHERITED_MASK ;
354+ Some ( Script :: Inherited )
355+ } else if self . ext . is_common ( ) {
356+ self . ext . third &= !ScriptExtension :: COMMON_MASK ;
357+ Some ( Script :: Common )
358+
381359 // Are there bits left in the first chunk?
382360 } else if self . ext . first != 0 {
383361 // Find the next bit
384362 let bit = self . ext . first . trailing_zeros ( ) ;
385363 // unset just that bit
386364 self . ext . first &= !( 1 << bit) ;
387365 Some ( Script :: for_integer ( bit as u8 ) )
366+
388367 // Are there bits left in the second chunk?
389368 } else if self . ext . second != 0 {
390369 let bit = self . ext . second . trailing_zeros ( ) ;
391370 self . ext . second &= !( 1 << bit) ;
392371 Some ( Script :: for_integer ( 64 + bit as u8 ) )
372+
393373 // Are there bits left in the third chunk?
394374 } else if self . ext . third != 0 {
395375 let bit = self . ext . third . trailing_zeros ( ) ;
@@ -429,8 +409,8 @@ mod tests {
429409 seen_scripts. insert ( script) ;
430410 seen_exts. insert ( ext) ;
431411 assert_eq ! ( script as u8 , bit) ;
432- assert ! ( ! ScriptExtension :: new_common( ) . intersection( ext) . is_empty( ) ) ;
433- assert ! ( ! ScriptExtension :: new_inherited( )
412+ assert ! ( ScriptExtension :: new_common( ) . intersection( ext) . is_empty( ) ) ;
413+ assert ! ( ScriptExtension :: new_inherited( )
434414 . intersection( ext)
435415 . is_empty( ) ) ;
436416 assert ! ( ScriptExtension :: new_unknown( ) . intersection( ext) . is_empty( ) ) ;
@@ -443,13 +423,13 @@ mod tests {
443423 fn test_specific ( ) {
444424 let s = "सवव मानवी व्यद्क् जन्मतःच स्वतींत्र आहेत व त्ाींना समान प्रवतष्ठा व समान अविकार आहेत. त्ाींना ववचारशद्क् व सवविे कबुद्द्धलाभलेली आहे. व त्ाींनी एकमेकाींशी बींिुत्वाचाभावनेने आचरण करावे." ;
445425 let ext = ScriptExtension :: for_str ( s) ;
446- assert_eq ! ( ext , script_extensions:: DEVA ) ;
426+ assert ! ( script_extensions:: DEVA . is_subset_or_equal ( ext ) ) ;
447427 println ! (
448- "{:? }" ,
428+ "{}" ,
449429 script_extensions:: DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH
450430 ) ;
451431 println ! (
452- "{:? }" ,
432+ "{}" ,
453433 ext. intersection(
454434 script_extensions:: DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH
455435 )
@@ -461,7 +441,9 @@ mod tests {
461441 let u = ext. union ( Script :: Dogra . into ( ) ) ;
462442 assert_eq ! (
463443 u. intersection(
464- script_extensions:: DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH
444+ script_extensions:: COMMON . union (
445+ script_extensions:: DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH
446+ )
465447 ) ,
466448 u
467449 ) ;
@@ -499,6 +481,68 @@ mod tests {
499481 assert ! ( scr. is_err( ) ) ;
500482 }
501483
484+ #[ test]
485+ fn test_subsets_and_iter ( ) {
486+ let cases: & [ ( ScriptExtension , & [ Script ] ) ] = & [
487+ ( ScriptExtension :: new_inherited ( ) , & [ Script :: Inherited ] ) ,
488+ ( ScriptExtension :: new_common ( ) , & [ Script :: Common ] ) ,
489+ (
490+ ScriptExtension :: new_inherited ( ) . union ( script_extensions:: COMMON ) ,
491+ & [ Script :: Inherited , Script :: Common ] ,
492+ ) ,
493+ (
494+ ScriptExtension :: new_inherited ( )
495+ . union ( script_extensions:: COMMON )
496+ . union ( script_extensions:: LATIN ) ,
497+ & [ Script :: Inherited , Script :: Common , Script :: Latin ] ,
498+ ) ,
499+ (
500+ ScriptExtension :: new_inherited ( )
501+ . union ( script_extensions:: COMMON )
502+ . union ( script_extensions:: LATIN )
503+ . union ( script_extensions:: CYRILLIC ) ,
504+ & [
505+ Script :: Inherited ,
506+ Script :: Common ,
507+ Script :: Cyrillic ,
508+ Script :: Latin ,
509+ ] ,
510+ ) ,
511+ ] ;
512+ for & ( full_extension, component_scripts) in cases {
513+ for & script in component_scripts. iter ( ) {
514+ assert ! ( full_extension. contains_script( script) ) ;
515+ let cur = script. into ( ) ;
516+ let intersect = full_extension. intersection ( cur) ;
517+ let union = full_extension. union ( cur) ;
518+ assert_eq ! ( intersect, cur) ;
519+ assert_eq ! ( union , full_extension) ;
520+
521+ assert ! ( cur. is_subset_or_equal( cur) ) ;
522+ assert ! ( cur. is_subset_or_equal( intersect) ) ;
523+ assert ! ( cur. is_subset_or_equal( full_extension) ) ;
524+ assert ! ( cur. is_subset_or_equal( union ) ) ;
525+ if component_scripts. len ( ) > 1 {
526+ assert ! ( !full_extension. is_subset_or_equal( cur) ) ;
527+ assert ! ( !union . is_subset_or_equal( cur) ) ;
528+ }
529+
530+ assert ! ( intersect. is_subset_or_equal( intersect) ) ;
531+ assert ! ( intersect. is_subset_or_equal( full_extension) ) ;
532+ assert ! ( intersect. is_subset_or_equal( union ) ) ;
533+ if component_scripts. len ( ) > 1 {
534+ assert ! ( !full_extension. is_subset_or_equal( intersect) ) ;
535+ assert ! ( !union . is_subset_or_equal( intersect) ) ;
536+ }
537+
538+ assert ! ( union . is_subset_or_equal( union ) ) ;
539+ }
540+ let scripts = component_scripts. iter ( ) . cloned ( ) . collect :: < Vec < _ > > ( ) ;
541+ let scripts_iterated = full_extension. iter ( ) . collect :: < Vec < _ > > ( ) ;
542+ assert_eq ! ( scripts, scripts_iterated) ;
543+ }
544+ }
545+
502546 #[ cfg( feature = "bench" ) ]
503547 #[ bench]
504548 fn bench_script_intersection ( b : & mut Bencher ) {
0 commit comments