class CharDet::UTF1632Prober

Public Class Methods

new() click to toggle source
Calls superclass method CharDet::CharSetProber::new
# File lib/rchardet/utf1632prober.rb, line 34
# Sets up a fresh prober: byte position and per-offset tallies at zero, no
# encoding ruled out yet, no pending surrogate half, then calls #reset.
def initialize
  super()
  @position = 0
  @quad = Array.new(4, 0)
  @zeros_at_mod = Array.new(4, 0)
  @nonzeros_at_mod = Array.new(4, 0)
  @state = EDetecting
  @invalid_utf16be = @invalid_utf16le = false
  @invalid_utf32be = @invalid_utf32le = false
  @first_half_surrogate_pair_detected_16be = false
  @first_half_surrogate_pair_detected_16le = false
  reset
end

Public Instance Methods

feed(aBuf) click to toggle source
# File lib/rchardet/utf1632prober.rb, line 82
# Consumes a chunk of input one byte at a time. Each byte is stored in the
# current 4-byte window and counted as zero or non-zero for its offset
# (position modulo 4). Whenever a window is completed, the UTF-32 and UTF-16
# validators are run on it.
#
# @param aBuf [#each_byte] chunk of input to examine
# @return the value of #get_state after the whole chunk has been consumed
def feed(aBuf)
  aBuf.each_byte do |byte|
    slot = @position % 4
    @quad[slot] = byte
    if slot == 3
      # A completed window is one UTF-32 unit or two UTF-16 units.
      validate_utf32_characters(@quad)
      # Array#[start, length] yields exactly two bytes per UTF-16 unit.
      validate_utf16_characters(@quad[0, 2])
      validate_utf16_characters(@quad[2, 2])
    end
    tally = byte.zero? ? @zeros_at_mod : @nonzeros_at_mod
    tally[slot] += 1
    @position += 1
  end

  get_state
end
get_charset_name() click to toggle source
# File lib/rchardet/utf1632prober.rb, line 65
# Names the encoding the tallies currently point to. The UTF-32 checks run
# before the UTF-16 ones, big-endian before little-endian within each width.
#
# @return [String] an encoding name; "UTF-16" when no check matches
def get_charset_name
  return "UTF-32BE" if is_likely_utf32be
  return "UTF-32LE" if is_likely_utf32le
  return "UTF-16BE" if is_likely_utf16be
  return "UTF-16LE" if is_likely_utf16le

  # Nothing matched: fall back to a name that is still a valid encoding.
  "UTF-16"
end
get_confidence() click to toggle source
# File lib/rchardet/utf1632prober.rb, line 117
# Reports a fixed confidence: 0.85 as soon as any of the four UTF-16/UTF-32
# likelihood checks passes, 0.00 otherwise.
#
# @return [Float]
def get_confidence
  checks = %i[is_likely_utf16le is_likely_utf16be is_likely_utf32le is_likely_utf32be]
  # any? stops at the first passing check, like the chained || it replaces.
  checks.any? { |check| send(check) } ? 0.85 : 0.00
end
get_state() click to toggle source
# File lib/rchardet/utf1632prober.rb, line 102
# Returns the prober state, promoting it when a verdict can be reached.
#
# ENotMe and EFoundIt are sticky. Otherwise a confidence above 0.80 moves the
# state to EFoundIt, and having read more than 4 KiB without that confidence
# moves it to ENotMe.
#
# @return the (possibly updated) value of @state
def get_state
  decided = [ENotMe, EFoundIt]
  return @state if decided.member?(@state)

  confident = get_confidence > 0.80
  # Only give up when we are past 4 KiB and still not confident.
  exhausted = !confident && @position > 4 * 1024

  @state = EFoundIt if confident
  @state = ENotMe if exhausted
  @state
end
reset() click to toggle source
Calls superclass method CharDet::CharSetProber#reset
# File lib/rchardet/utf1632prober.rb, line 50
# Returns the prober to its starting condition: superclass reset first, then
# position, 4-byte window, per-offset tallies, state and validity flags.
def reset
  super()
  @state = EDetecting
  @position = 0
  @quad = Array.new(4, 0)
  @zeros_at_mod = Array.new(4, 0)
  @nonzeros_at_mod = Array.new(4, 0)
  @invalid_utf16be = @invalid_utf16le = false
  @invalid_utf32be = @invalid_utf32le = false
  @first_half_surrogate_pair_detected_16be = false
  @first_half_surrogate_pair_detected_16le = false
end

Private Instance Methods

approx_16bit_chars() click to toggle source
# File lib/rchardet/utf1632prober.rb, line 131
# Estimates how many 2-byte units have been read: half the byte position,
# never less than 1.0.
#
# @return [Float]
def approx_16bit_chars
  estimate = @position / 2.0
  estimate > 1.0 ? estimate : 1.0
end
approx_32bit_chars() click to toggle source
# File lib/rchardet/utf1632prober.rb, line 127
# Estimates how many 4-byte units have been read: a quarter of the byte
# position, never less than 1.0.
#
# @return [Float]
def approx_32bit_chars
  estimate = @position / 4.0
  estimate > 1.0 ? estimate : 1.0
end
is_likely_utf16be() click to toggle source
# File lib/rchardet/utf1632prober.rb, line 156
# True when enough 16-bit units have been read, offsets 1 and 3 are mostly
# non-zero, offsets 0 and 2 are mostly zero, and no invalid UTF-16BE
# sequence has been recorded.
#
# @return [Boolean]
def is_likely_utf16be
  units = approx_16bit_chars
  return false if units < MIN_CHARS_FOR_DETECTION

  odd_nonzero_ratio = (@nonzeros_at_mod[1] + @nonzeros_at_mod[3]) / units
  even_zero_ratio = (@zeros_at_mod[0] + @zeros_at_mod[2]) / units
  odd_nonzero_ratio > EXPECTED_RATIO && even_zero_ratio > EXPECTED_RATIO && !@invalid_utf16be
end
is_likely_utf16le() click to toggle source
# File lib/rchardet/utf1632prober.rb, line 164
# True when enough 16-bit units have been read, offsets 0 and 2 are mostly
# non-zero, offsets 1 and 3 are mostly zero, and no invalid UTF-16LE
# sequence has been recorded.
#
# @return [Boolean]
def is_likely_utf16le
  units = approx_16bit_chars
  return false if units < MIN_CHARS_FOR_DETECTION

  even_nonzero_ratio = (@nonzeros_at_mod[0] + @nonzeros_at_mod[2]) / units
  odd_zero_ratio = (@zeros_at_mod[1] + @zeros_at_mod[3]) / units
  even_nonzero_ratio > EXPECTED_RATIO && odd_zero_ratio > EXPECTED_RATIO && !@invalid_utf16le
end
is_likely_utf32be() click to toggle source
# File lib/rchardet/utf1632prober.rb, line 135
# True when enough 32-bit units have been read, offsets 0-2 are mostly zero,
# offset 3 is mostly non-zero, and no invalid UTF-32BE unit has been
# recorded.
#
# @return [Boolean]
def is_likely_utf32be
  units = approx_32bit_chars
  return false if units < MIN_CHARS_FOR_DETECTION

  tallies = [@zeros_at_mod[0], @zeros_at_mod[1], @zeros_at_mod[2], @nonzeros_at_mod[3]]
  tallies.all? { |count| count / units > EXPECTED_RATIO } && !@invalid_utf32be
end
is_likely_utf32le() click to toggle source
# File lib/rchardet/utf1632prober.rb, line 146
# True when enough 32-bit units have been read, offset 0 is mostly non-zero,
# offsets 1-3 are mostly zero, and no invalid UTF-32LE unit has been
# recorded.
#
# @return [Boolean]
def is_likely_utf32le
  units = approx_32bit_chars
  return false if units < MIN_CHARS_FOR_DETECTION

  tallies = [@nonzeros_at_mod[0], @zeros_at_mod[1], @zeros_at_mod[2], @zeros_at_mod[3]]
  tallies.all? { |count| count / units > EXPECTED_RATIO } && !@invalid_utf32le
end
validate_utf16_characters(pair) click to toggle source

@param [Array<Integer>] pair two consecutive bytes
@return [void]

# File lib/rchardet/utf1632prober.rb, line 193
# Checks one 16-bit unit against the UTF-16 surrogate rules, once per byte
# order, and records violations in @invalid_utf16be / @invalid_utf16le.
#
# UTF-16 code units span 0x0000-0xFFFF. Units in 0xD800-0xDFFF are only legal
# as a surrogate pair: a unit in 0xD800-0xDBFF immediately followed by one in
# 0xDC00-0xDFFF. Only the high-order byte of the unit is needed to classify
# it, which is pair[0] for big-endian and pair[1] for little-endian.
#
# https://en.wikipedia.org/wiki/UTF-16
#
# @param pair [Array<Integer>] two consecutive bytes
# @return [void]
def validate_utf16_characters(pair)
  lead_surrogate = 0xD8..0xDB
  trail_surrogate = 0xDC..0xDF

  # Big-endian reading: classify by pair[0].
  if @first_half_surrogate_pair_detected_16be
    # A lead surrogate is pending, so this unit must be the trailing half.
    if trail_surrogate.include?(pair[0])
      @first_half_surrogate_pair_detected_16be = false
    else
      @invalid_utf16be = true
    end
  elsif lead_surrogate.include?(pair[0])
    @first_half_surrogate_pair_detected_16be = true
  elsif trail_surrogate.include?(pair[0])
    # Trailing surrogate with no lead before it.
    @invalid_utf16be = true
  end

  # Little-endian reading: same rules, classified by pair[1].
  if @first_half_surrogate_pair_detected_16le
    if trail_surrogate.include?(pair[1])
      @first_half_surrogate_pair_detected_16le = false
    else
      @invalid_utf16le = true
    end
  elsif lead_surrogate.include?(pair[1])
    @first_half_surrogate_pair_detected_16le = true
  elsif trail_surrogate.include?(pair[1])
    @invalid_utf16le = true
  end
end
validate_utf32_characters(quad) click to toggle source

@param [Array<Integer>] quad four consecutive bytes
@return [void]

# File lib/rchardet/utf1632prober.rb, line 174
# Checks one 4-byte group against the UTF-32 range rules, once per byte
# order, and records violations in @invalid_utf32be / @invalid_utf32le.
#
# A UTF-32 unit is valid in 0x00000000-0x0010FFFF, excluding the surrogate
# block 0x0000D800-0x0000DFFF. Per byte order that means: the most
# significant byte must be 0, the next byte must not exceed 0x10, and when
# both are 0 the third byte must not fall in 0xD8-0xDF.
#
# https://en.wikipedia.org/wiki/UTF-32
#
# @param quad [Array<Integer>] four consecutive bytes
# @return [void]
def validate_utf32_characters(quad)
  surrogate_lead = 0xD8..0xDF

  # `||`/`&&` with explicit grouping: the keyword forms `or`/`and` share one
  # precedence level and associate left, which would regroup this condition.
  if quad[0] != 0 || quad[1] > 0x10 ||
     (quad[0] == 0 && quad[1] == 0 && surrogate_lead.include?(quad[2]))
    @invalid_utf32be = true
  end

  # Little-endian: same test with the byte significance mirrored.
  if quad[3] != 0 || quad[2] > 0x10 ||
     (quad[3] == 0 && quad[2] == 0 && surrogate_lead.include?(quad[1]))
    @invalid_utf32le = true
  end
end