class CharDet::UTF1632Prober
Public Class Methods
new()
click to toggle source
Calls superclass method
CharDet::CharSetProber::new
# File lib/rchardet/utf1632prober.rb, line 34
# Builds a prober in its initial state.
#
# Every piece of per-stream state (byte position, zero / non-zero tallies,
# the rolling four-byte window, the validity flags, the surrogate tracking
# flags and @state) is assigned by #reset, so the constructor delegates to
# it instead of keeping a second copy of the same assignments.
def initialize
  super()
  reset
end
Public Instance Methods
feed(aBuf)
click to toggle source
# File lib/rchardet/utf1632prober.rb, line 82
# Consumes a chunk of input and updates the detection statistics.
#
# Each byte is written into a rolling four-byte window (@quad) at the slot
# given by the stream position modulo 4. Every time the last slot is filled
# the window is validated once as a UTF-32 unit and twice as UTF-16 units
# (first two bytes, then last two bytes). Each byte is also counted as zero
# or non-zero for its slot.
#
# @param aBuf [#each_byte] the bytes to examine
# @return the value of #get_state after the whole chunk has been processed
def feed(aBuf)
  aBuf.each_byte do |byte|
    slot = @position % 4
    @quad[slot] = byte

    if slot == 3
      validate_utf32_characters(@quad)
      # Array#[start, length]: hand over exactly two bytes per UTF-16 unit.
      validate_utf16_characters(@quad[0, 2])
      validate_utf16_characters(@quad[2, 2])
    end

    tally = byte.zero? ? @zeros_at_mod : @nonzeros_at_mod
    tally[slot] += 1
    @position += 1
  end

  get_state
end
get_charset_name()
click to toggle source
# File lib/rchardet/utf1632prober.rb, line 65
# Names the encoding that currently looks most plausible.
#
# The likelihood checks are consulted in a fixed order (UTF-32BE, UTF-32LE,
# UTF-16BE, UTF-16LE) and the first one that passes decides the answer.
#
# @return [String] one of "UTF-32BE", "UTF-32LE", "UTF-16BE", "UTF-16LE",
#   or "UTF-16" when none of the checks passes
def get_charset_name
  return "UTF-32BE" if is_likely_utf32be
  return "UTF-32LE" if is_likely_utf32le
  return "UTF-16BE" if is_likely_utf16be
  return "UTF-16LE" if is_likely_utf16le

  # default to something valid
  "UTF-16"
end
get_confidence()
click to toggle source
# File lib/rchardet/utf1632prober.rb, line 117
# Reports how confident the prober is that the input is UTF-16 or UTF-32.
#
# @return [Float] 0.85 as soon as any of the four likelihood checks passes,
#   0.0 otherwise
def get_confidence
  return 0.85 if is_likely_utf16le || is_likely_utf16be || is_likely_utf32le || is_likely_utf32be

  0.00
end
get_state()
click to toggle source
# File lib/rchardet/utf1632prober.rb, line 102
# Returns the current detection state, updating it first if it is still open.
#
# ENotMe and EFoundIt are terminal: once reached they are returned as-is.
# Otherwise the state becomes EFoundIt when the confidence exceeds 0.80, or
# ENotMe once more than 4 KiB have been consumed without reaching that
# confidence.
#
# @return the (possibly just updated) value of @state
def get_state
  decided = [ENotMe, EFoundIt].include?(@state)

  unless decided
    if get_confidence > 0.80
      @state = EFoundIt
    elsif @position > 4 * 1024
      # 4 KiB into the input without a conclusion: give up.
      @state = ENotMe
    end
  end

  @state
end
reset()
click to toggle source
Calls superclass method
CharDet::CharSetProber#reset
# File lib/rchardet/utf1632prober.rb, line 50
# Puts the prober back into its initial state so it can examine a new stream.
#
# Calls the superclass #reset first, then clears the byte position, the
# zero / non-zero tallies, the validity and surrogate-tracking flags, and the
# rolling four-byte window, and sets @state to EDetecting.
def reset
  super()
  @state = EDetecting
  @position = 0
  @zeros_at_mod = Array.new(4, 0)
  @nonzeros_at_mod = Array.new(4, 0)
  @invalid_utf16be = @invalid_utf16le = false
  @invalid_utf32be = @invalid_utf32le = false
  @first_half_surrogate_pair_detected_16be = false
  @first_half_surrogate_pair_detected_16le = false
  @quad = Array.new(4, 0)
end
Private Instance Methods
approx_16bit_chars()
click to toggle source
# File lib/rchardet/utf1632prober.rb, line 131
# Estimates how many 16-bit units have been consumed so far.
#
# @return [Float] half of @position, but never less than 1.0 so the result
#   is always safe to divide by
def approx_16bit_chars
  units = @position / 2.0
  units > 1.0 ? units : 1.0
end
approx_32bit_chars()
click to toggle source
# File lib/rchardet/utf1632prober.rb, line 127
# Estimates how many 32-bit units have been consumed so far.
#
# @return [Float] a quarter of @position, but never less than 1.0 so the
#   result is always safe to divide by
def approx_32bit_chars
  units = @position / 4.0
  units > 1.0 ? units : 1.0
end
is_likely_utf16be()
click to toggle source
# File lib/rchardet/utf1632prober.rb, line 156
# Decides whether the bytes seen so far look like UTF-16BE.
#
# Requires at least MIN_CHARS_FOR_DETECTION approximate 16-bit units, a
# non-zero share above EXPECTED_RATIO in slots 1 and 3, a zero share above
# EXPECTED_RATIO in slots 0 and 2, and no UTF-16BE validation failure.
#
# @return [Boolean]
def is_likely_utf16be
  units = approx_16bit_chars
  return false unless units >= MIN_CHARS_FOR_DETECTION

  nonzero_share = (@nonzeros_at_mod[1] + @nonzeros_at_mod[3]) / units
  return false unless nonzero_share > EXPECTED_RATIO

  zero_share = (@zeros_at_mod[0] + @zeros_at_mod[2]) / units
  return false unless zero_share > EXPECTED_RATIO

  !@invalid_utf16be
end
is_likely_utf16le()
click to toggle source
# File lib/rchardet/utf1632prober.rb, line 164
# Decides whether the bytes seen so far look like UTF-16LE.
#
# Requires at least MIN_CHARS_FOR_DETECTION approximate 16-bit units, a
# non-zero share above EXPECTED_RATIO in slots 0 and 2, a zero share above
# EXPECTED_RATIO in slots 1 and 3, and no UTF-16LE validation failure.
#
# @return [Boolean]
def is_likely_utf16le
  units = approx_16bit_chars
  return false unless units >= MIN_CHARS_FOR_DETECTION

  nonzero_share = (@nonzeros_at_mod[0] + @nonzeros_at_mod[2]) / units
  return false unless nonzero_share > EXPECTED_RATIO

  zero_share = (@zeros_at_mod[1] + @zeros_at_mod[3]) / units
  return false unless zero_share > EXPECTED_RATIO

  !@invalid_utf16le
end
is_likely_utf32be()
click to toggle source
# File lib/rchardet/utf1632prober.rb, line 135
# Decides whether the bytes seen so far look like UTF-32BE.
#
# Requires at least MIN_CHARS_FOR_DETECTION approximate 32-bit units, a zero
# share above EXPECTED_RATIO in each of slots 0, 1 and 2, a non-zero share
# above EXPECTED_RATIO in slot 3, and no UTF-32BE validation failure.
#
# @return [Boolean]
def is_likely_utf32be
  units = approx_32bit_chars
  return false unless units >= MIN_CHARS_FOR_DETECTION

  counts = [@zeros_at_mod[0], @zeros_at_mod[1], @zeros_at_mod[2], @nonzeros_at_mod[3]]
  counts.all? { |count| count / units > EXPECTED_RATIO } && !@invalid_utf32be
end
is_likely_utf32le()
click to toggle source
# File lib/rchardet/utf1632prober.rb, line 146
# Decides whether the bytes seen so far look like UTF-32LE.
#
# Requires at least MIN_CHARS_FOR_DETECTION approximate 32-bit units, a
# non-zero share above EXPECTED_RATIO in slot 0, a zero share above
# EXPECTED_RATIO in each of slots 1, 2 and 3, and no UTF-32LE validation
# failure.
#
# @return [Boolean]
def is_likely_utf32le
  units = approx_32bit_chars
  return false unless units >= MIN_CHARS_FOR_DETECTION

  counts = [@nonzeros_at_mod[0], @zeros_at_mod[1], @zeros_at_mod[2], @zeros_at_mod[3]]
  counts.all? { |count| count / units > EXPECTED_RATIO } && !@invalid_utf32le
end
validate_utf16_characters(pair)
click to toggle source
@param [Array<Integer>] pair two consecutive bytes @return [void]
# File lib/rchardet/utf1632prober.rb, line 193
# Checks one 16-bit unit against the UTF-16 surrogate rules, for both byte
# orders at once.
#
# UTF-16 code units span 0x0000-0xFFFF. The range 0xD800-0xDFFF is reserved
# for surrogates, which are only valid as a pair: a high surrogate
# (0xD800-0xDBFF) immediately followed by a low surrogate (0xDC00-0xDFFF).
# https://en.wikipedia.org/wiki/UTF-16
#
# pair[0] is inspected as the high-order byte for the big-endian reading and
# pair[1] as the high-order byte for the little-endian reading. A violation
# sets @invalid_utf16be / @invalid_utf16le; those flags are never cleared
# here.
#
# @param pair [Array<Integer>] two consecutive bytes
# @return [void]
def validate_utf16_characters(pair)
  be_high = pair[0]
  if @first_half_surrogate_pair_detected_16be
    # A high surrogate is pending: only a low surrogate may follow.
    if (0xDC..0xDF).include?(be_high)
      @first_half_surrogate_pair_detected_16be = false
    else
      @invalid_utf16be = true
    end
  elsif (0xD8..0xDB).include?(be_high)
    @first_half_surrogate_pair_detected_16be = true
  elsif (0xDC..0xDF).include?(be_high)
    # Low surrogate with no preceding high surrogate.
    @invalid_utf16be = true
  end

  le_high = pair[1]
  if @first_half_surrogate_pair_detected_16le
    # A high surrogate is pending: only a low surrogate may follow.
    if (0xDC..0xDF).include?(le_high)
      @first_half_surrogate_pair_detected_16le = false
    else
      @invalid_utf16le = true
    end
  elsif (0xD8..0xDB).include?(le_high)
    @first_half_surrogate_pair_detected_16le = true
  elsif (0xDC..0xDF).include?(le_high)
    # Low surrogate with no preceding high surrogate.
    @invalid_utf16le = true
  end

  nil
end
validate_utf32_characters(quad)
click to toggle source
@param [Array<Integer>] quad four consecutive bytes @return [void]
# File lib/rchardet/utf1632prober.rb, line 174
# Checks one four-byte unit against the UTF-32 rules, for both byte orders
# at once.
#
# UTF-32 is valid in the range 0x00000000-0x0010FFFF, excluding the
# surrogate block 0x0000D800-0x0000DFFF.
# https://en.wikipedia.org/wiki/UTF-32
#
# A violation sets @invalid_utf32be / @invalid_utf32le; those flags are
# never cleared here.
#
# The conditions use ||/&& with explicit grouping. Ruby's `and`/`or` share a
# single precedence level, so an ungrouped `a or b or c and d` is evaluated
# as `((a or b) or c) and d`, which would let out-of-range units through.
#
# @param quad [Array<Integer>] four consecutive bytes
# @return [void]
def validate_utf32_characters(quad)
  # Big-endian reading: quad[0] is the most significant byte.
  be_out_of_range = quad[0] != 0 || quad[1] > 0x10
  be_surrogate = quad[0] == 0 && quad[1] == 0 && (0xD8..0xDF).include?(quad[2])
  @invalid_utf32be = true if be_out_of_range || be_surrogate

  # Little-endian reading: quad[3] is the most significant byte.
  le_out_of_range = quad[3] != 0 || quad[2] > 0x10
  le_surrogate = quad[3] == 0 && quad[2] == 0 && (0xD8..0xDF).include?(quad[1])
  @invalid_utf32le = true if le_out_of_range || le_surrogate

  nil
end