lib/rdoc/encoding.rb


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131

# coding: US-ASCII
# frozen_string_literal: true

##
# This class is a wrapper around File IO and Encoding that helps RDoc load
# files and convert them to the correct encoding.

module RDoc::Encoding

  HEADER_REGEXP = /^
    (?:
      \A\#!.*\n
      |
      ^\#\s+frozen[-_]string[-_]literal[=:].+\n
      |
      ^\#[^\n]+\b(?:en)?coding[=:]\s*(?<name>[^\s;]+).*\n
      |
      <\?xml[^?]*encoding=(?<quote>["'])(?<name>.*?)\k<quote>.*\n
    )+
  /xi # :nodoc:

  ##
  # Reads the contents of +filename+ and handles any encoding directives in
  # the file.
  #
  # The content will be converted to the +encoding+.  If the file cannot be
  # converted a warning will be printed and nil will be returned.
  #
  # If +force_transcode+ is true the document will be transcoded and any
  # unknown character in the target encoding will be replaced with '?'

  def self.read_file filename, encoding, force_transcode = false
    content = File.open filename, "rb" do |f| f.read end
    content.gsub!("\r\n", "\n") if RUBY_PLATFORM =~ /mswin|mingw/

    utf8 = content.sub!(/\A\xef\xbb\xbf/, '')

    enc = RDoc::Encoding.detect_encoding content
    content = RDoc::Encoding.change_encoding content, enc if enc

    begin
      encoding ||= Encoding.default_external
      orig_encoding = content.encoding

      if not orig_encoding.ascii_compatible? then
        content = content.encode encoding
      elsif utf8 then
        content = RDoc::Encoding.change_encoding content, Encoding::UTF_8
        content = content.encode encoding
      else
        # assume the content is in our output encoding
        content = RDoc::Encoding.change_encoding content, encoding
      end

      unless content.valid_encoding? then
        # revert and try to transcode
        content = RDoc::Encoding.change_encoding content, orig_encoding
        content = content.encode encoding
      end

      unless content.valid_encoding? then
        warn "unable to convert #{filename} to #{encoding}, skipping"
        content = nil
      end
    rescue Encoding::InvalidByteSequenceError,
           Encoding::UndefinedConversionError => e
      if force_transcode then
        content = RDoc::Encoding.change_encoding content, orig_encoding
        content = content.encode(encoding,
                                 :invalid => :replace,
                                 :undef => :replace,
                                 :replace => '?')
        return content
      else
        warn "unable to convert #{e.message} for #{filename}, skipping"
        return nil
      end
    end

    content
  rescue ArgumentError => e
    raise unless e.message =~ /unknown encoding name - (.*)/
    warn "unknown encoding name \"#{$1}\" for #{filename}, skipping"
    nil
  rescue Errno::EISDIR, Errno::ENOENT
    nil
  end

  def self.remove_frozen_string_literal string
    string =~ /\A(?:#!.*\n)?(.*\n)/
    first_line = $1

    if first_line =~ /\A# +frozen[-_]string[-_]literal[=:].+$/i
      string = string.sub first_line, ''
    end

    string
  end

  ##
  # Detects the encoding of +string+ based on the magic comment

  def self.detect_encoding string
    result = HEADER_REGEXP.match string
    name = result && result[:name]

    name ? Encoding.find(name) : nil
  end

  ##
  # Removes magic comments and shebang

  def self.remove_magic_comment string
    string.sub HEADER_REGEXP do |s|
      s.gsub(/[^\n]/, '')
    end
  end

  ##
  # Changes encoding based on +encoding+ without converting and returns new
  # string

  def self.change_encoding text, encoding
    if text.kind_of? RDoc::Comment
      text.encode! encoding
    else
      String.new text, encoding: encoding
    end
  end

end