DZone Snippets is a public source code repository. Easily build up your personal collection of code snippets, categorize them with tags / keywords, and share them with the world

Snippets has posted 5883 posts at DZone. View Full User Profile

Punycoded URLs In Ruby

09.26.2007
| 7750 views |
  • submit to reddit
        This is a proof-of-concept snippet showing how to internationalize domain names with <a href="http://raa.ruby-lang.org/project/punycode4r/">punycode4r</a> (install it with: sudo gem install punycode4r).

For more information please see:
- <a href="http://en.wikipedia.org/wiki/Punycode">Punycode</a>
- <a href="http://en.wikipedia.org/wiki/Internationalizing_Domain_Names_in_Applications">Internationalized domain name</a>



#!/usr/local/bin/ruby -Ku

# NOTE: The following is not the complete source code by Kazuhiro NISHIYAMA.
#       For the full source code with more features, comments & test cases please see: 
#       open -e `gem environment gemdir`/gems/punycode4r-0.2.0/lib/punycode.rb
#
# This is pure Ruby implementing Punycode (RFC 3492).
# (original ANSI C code (C89) implementing Punycode is in RFC 3492)
#
# copyright (c) 2005 Kazuhiro NISHIYAMA
# You can redistribute it and/or modify it under the same terms as Ruby.


require "unicode"     # sudo gem install unicode

# Pure-Ruby Punycode (RFC 3492) encoder/decoder, ported from the ANSI C
# reference implementation in the RFC.
#
# High-level entry points:
#   Punycode.encode(unicode_string)  # UTF-8 string  -> Punycode (ASCII) string
#   Punycode.decode(punycode)        # Punycode string -> UTF-8 string
#
# The low-level punycode_encode / punycode_decode methods keep the C calling
# convention: code points are passed as arrays of integers and lengths are
# passed/returned through one-element arrays.
module Punycode

  # Status types mirroring the status codes of the C reference code.
  # Failures are raised as exceptions; success is signalled by returning
  # the PunycodeSuccess class.
  module Status
    class Error < StandardError; end
    class PunycodeSuccess; end
    # Input is invalid.
    class PunycodeBadInput < Error; end
    # Output would exceed the space provided.
    class PunycodeBigOutput < Error; end
    # Input needs wider integers to process.
    class PunycodeOverflow < Error; end
  end
  include Status

  # Bootstring parameters for Punycode.
  BASE = 36
  TMIN = 1
  TMAX = 26
  SKEW = 38
  DAMP = 700
  INITIAL_BIAS = 72
  INITIAL_N = 0x80
  DELIMITER = 0x2D

  # Upper bound used by the overflow checks carried over from the C code.
  MAXINT = 1 << 64

  # Buffer limits used by the high-level encode / decode wrappers.
  UNICODE_MAX_LENGTH = 256
  ACE_MAX_LENGTH = 256

  # The following string is used to convert printable
  # characters between ASCII and the native charset
  # (non-printable positions map to "\n"):
  PRINT_ASCII =
    "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n" \
    "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n" \
    " !\"\#$%&'()*+,-./" \
    "0123456789:;<=>?" \
    "@ABCDEFGHIJKLMNO" \
    "PQRSTUVWXYZ[\\]^_" \
    "`abcdefghijklmno" \
    "pqrstuvwxyz{|}~\n"

  module_function

  # True if +cp+ is a basic (ASCII) code point.
  def basic(cp)
    cp < 0x80
  end

  # True if +cp+ is the Punycode delimiter ("-").
  def delim(cp)
    cp == DELIMITER
  end

  # Returns the numeric value (0...BASE) of the basic code point +cp+ used as
  # a Punycode digit, or BASE if +cp+ is not a digit.
  #
  # 'A'..'Z' and 'a'..'z' map to 0..25, '0'..'9' map to 26..35.
  #
  # The C original relies on unsigned wrap-around (`cp - 48 < 10`); Ruby
  # integers go negative instead, so explicit ranges are required to reject
  # non-digit bytes such as "-" or ":".
  def decode_digit(cp)
    if cp >= 48 && cp <= 57
      cp - 22
    elsif cp >= 65 && cp <= 90
      cp - 65
    elsif cp >= 97 && cp <= 122
      cp - 97
    else
      BASE
    end
  end

  # Returns the basic code point for digit +d+ (0...BASE).
  # 0..25 map to 'a'..'z' (or 'A'..'Z' when +flag+ is truthy),
  # 26..35 map to '0'..'9'.
  def encode_digit(d, flag)
    d + 22 + 75 * (d < 26 ? 1 : 0) - ((flag ? 1 : 0) << 5)
  end

  # True if the basic code point +bcp+ is an uppercase ASCII letter.
  def flagged(bcp)
    bcp >= 65 && bcp <= 90
  end

  # Forces the case of the basic code point +bcp+: letters become uppercase
  # when +flag+ is truthy and lowercase otherwise; other code points are
  # returned unchanged.
  def encode_basic(bcp, flag)
    # Fold lowercase letters to uppercase first ...
    bcp -= 1 << 5 if bcp >= 97 && bcp <= 122
    # ... then move back to lowercase unless the flag asks for uppercase.
    bcp += 1 << 5 if !flag && bcp >= 65 && bcp <= 90
    bcp
  end

  # Bias adaptation function (RFC 3492, section 6.1).
  #
  # delta     - delta that was just encoded/decoded
  # numpoints - number of code points handled so far (including this one)
  # firsttime - true for the very first delta
  #
  # Returns the new bias.
  def adapt(delta, numpoints, firsttime)
    delta = firsttime ? delta / DAMP : delta >> 1
    delta += delta / numpoints

    k = 0
    while delta > ((BASE - TMIN) * TMAX) / 2
      delta /= BASE - TMIN
      k += BASE
    end

    k + (BASE - TMIN + 1) * delta / (delta + SKEW)
  end

  # Threshold t for digit position +k+ under the current +bias+,
  # clamped to TMIN..TMAX. Shared by the encoder and the decoder.
  def threshold(k, bias)
    if k <= bias # + TMIN not needed
      TMIN
    elsif k >= bias + TMAX
      TMAX
    else
      k - bias
    end
  end

  # Low-level encoder.
  #
  # input_length  - number of code points in +input+
  # input         - array of integer code points
  # case_flags    - nil, or an array of booleans parallel to +input+
  #                 (truthy = emit the corresponding character uppercase)
  # output_length - one-element array: on entry the maximum number of output
  #                 code points, on return the number actually written
  # output        - array that receives the ASCII code points
  #
  # Returns PunycodeSuccess. Raises PunycodeBigOutput when the output limit
  # would be exceeded and PunycodeOverflow when a delta exceeds MAXINT.
  def punycode_encode(input_length, input, case_flags, output_length, output)
    n = INITIAL_N
    delta = out = 0
    max_out = output_length[0]
    bias = INITIAL_BIAS

    # Copy the basic code points straight to the output.
    input_length.times do |j|
      next unless basic(input[j])
      raise PunycodeBigOutput if max_out - out < 2
      output[out] = case_flags ? encode_basic(input[j], case_flags[j]) : input[j]
      out += 1
    end

    # h is the number of code points handled so far,
    # b is the number of basic code points.
    h = b = out

    if b > 0
      output[out] = DELIMITER
      out += 1
    end

    while h < input_length
      # All code points below n have been handled; find the next larger one.
      m = MAXINT
      input_length.times do |j|
        cp = input[j]
        m = cp if cp >= n && cp < m
      end

      # Advance the decoder's <n, i> state to <m, 0>, guarding the MAXINT limit.
      raise PunycodeOverflow if m - n > (MAXINT - delta) / (h + 1)
      delta += (m - n) * (h + 1)
      n = m

      input_length.times do |j|
        # Ruby integers never wrap, so the C code's "delta == 0" overflow
        # check after this increment is not needed.
        delta += 1 if input[j] < n
        next unless input[j] == n

        # Emit delta as a generalized variable-length integer.
        q = delta
        k = BASE
        loop do
          raise PunycodeBigOutput if out >= max_out
          t = threshold(k, bias)
          break if q < t
          output[out] = encode_digit(t + (q - t) % (BASE - t), false)
          out += 1
          q = (q - t) / (BASE - t)
          k += BASE
        end

        # The case of the final digit carries the case flag.
        output[out] = encode_digit(q, case_flags && case_flags[j])
        out += 1
        bias = adapt(delta, h + 1, h == b)
        delta = 0
        h += 1
      end

      delta += 1
      n += 1
    end

    output_length[0] = out
    PunycodeSuccess
  end

  # Low-level decoder.
  #
  # input_length  - number of code points in +input+
  # input         - array of ASCII code points (integers)
  # output_length - one-element array: on entry the maximum number of output
  #                 code points, on return the number actually written
  # output        - array that receives the decoded code points
  # case_flags    - nil, or an array that receives one boolean per output
  #                 code point (true = the input character was uppercase)
  #
  # Returns PunycodeSuccess. Raises PunycodeBadInput for malformed input,
  # PunycodeBigOutput when the output limit would be exceeded and
  # PunycodeOverflow when a value exceeds MAXINT.
  def punycode_decode(input_length, input, output_length, output, case_flags)
    n = INITIAL_N
    out = i = 0
    max_out = output_length[0]
    bias = INITIAL_BIAS

    # b is the index of the last delimiter, or 0 if there is none:
    # everything before it is copied literally.
    b = 0
    input_length.times do |j|
      b = j if delim(input[j])
    end
    raise PunycodeBigOutput if b > max_out

    b.times do |j|
      case_flags[out] = flagged(input[j]) if case_flags
      raise PunycodeBadInput unless basic(input[j])
      output[out] = input[j]
      out += 1
    end

    # Start just after the last delimiter if any basic code points were
    # copied, otherwise at the beginning of the input.
    in_ = b > 0 ? b + 1 : 0
    while in_ < input_length
      # Decode a generalized variable-length integer into i.
      oldi = i
      w = 1
      k = BASE
      loop do
        raise PunycodeBadInput if in_ >= input_length
        digit = decode_digit(input[in_])
        in_ += 1
        raise PunycodeBadInput if digit >= BASE
        raise PunycodeOverflow if digit > (MAXINT - i) / w
        i += digit * w
        t = threshold(k, bias)
        break if digit < t
        raise PunycodeOverflow if w > MAXINT / (BASE - t)
        w *= BASE - t
        k += BASE
      end

      bias = adapt(i - oldi, out + 1, oldi == 0)

      # i was supposed to wrap around from out+1 to 0, incrementing n each time.
      raise PunycodeOverflow if i / (out + 1) > MAXINT - n
      n += i / (out + 1)
      i %= out + 1

      raise PunycodeBigOutput if out >= max_out

      if case_flags
        # Shift the tail right by one slot (memmove in the C original).
        case_flags[i + 1, out - i] = case_flags[i, out - i]

        # Case of last character determines uppercase flag:
        case_flags[i] = flagged(input[in_ - 1])
      end

      # Insert n at position i of the output (memmove + store in the C original).
      output[i + 1, out - i] = output[i, out - i]
      output[i] = n
      i += 1

      out += 1
    end

    output_length[0] = out
    PunycodeSuccess
  end

  # Encodes a UTF-8 string as Punycode.
  #
  # unicode_string   - the string to encode
  # case_flags       - optional array of booleans, one per code point
  #                    (truthy = emit the corresponding character uppercase)
  # print_ascii_only - when true, every output character is mapped through
  #                    PRINT_ASCII
  #
  # Returns the Punycode string (without any "xn--" prefix).
  # Raises the same errors as punycode_encode.
  def encode(unicode_string, case_flags=nil, print_ascii_only=false)
    input = unicode_string.unpack('U*')
    output = [0] * (ACE_MAX_LENGTH + 1)
    output_length = [ACE_MAX_LENGTH]

    punycode_encode(input.size, input, case_flags, output_length, output)

    outlen = output_length[0]
    outlen.times do |j|
      c = output[j]
      unless c >= 0 && c <= 127
        raise Error, "assertion error: invalid output char"
      end
      raise PunycodeBadInput unless PRINT_ASCII[c]
      output[j] = PRINT_ASCII[c] if print_ascii_only
    end

    # Only the first outlen slots were written; the rest is zero padding.
    output[0, outlen].map { |x| x.chr }.join('')
  end

  # Decodes a Punycode string (without the "xn--" prefix) to a UTF-8 string.
  #
  # punycode   - the ASCII Punycode string
  # case_flags - array that receives one boolean per decoded code point
  #
  # Raises PunycodeBigOutput if the input is longer than ACE_MAX_LENGTH * 2
  # bytes, PunycodeBadInput if it contains non-ASCII bytes, and the same
  # errors as punycode_decode.
  def decode(punycode, case_flags=[])
    input = []
    output = []

    raise PunycodeBigOutput if ACE_MAX_LENGTH * 2 < punycode.size

    punycode.each_byte do |c|
      raise PunycodeBadInput unless c >= 0 && c <= 127
      input.push(c)
    end

    output_length = [UNICODE_MAX_LENGTH]
    Punycode.punycode_decode(input.length, input, output_length,
                             output, case_flags)
    output.pack('U*')
  end
end



# cf. http://snippets.dzone.com/posts/show/4527

# Matches a string that, from start (\A) to end (\z), consists only of the
# byte sequences listed in the alternation below: tab, LF, CR and printable
# ASCII (0x20-0x7E), plus the multi-byte forms described by the per-line
# comments. Other ASCII control bytes are not accepted by the first branch.
# Flags: m (multiline), n (no encoding, i.e. byte-wise matching),
# x (extended syntax: whitespace and comments inside the pattern are ignored).
UTF8REGEX = /\A(?:                                                            
              [\x09\x0A\x0D\x20-\x7E]            # ASCII
            | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
            |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
            | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
            |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
            |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
            | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
            |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
            )*\z/mnx


# Unanchored variant without the ASCII branch: matches wherever a string
# contains at least one of the multi-byte sequences listed below.
# Same m/n/x flags as UTF8REGEX.
UTF8_REGEX_MBYTE = /(?:                                 
                 [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
               |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
               | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
               |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
               |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
               | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
               |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
               )/mnx



# cf. http://demo.icu-project.org/icu-bin/idnbrowser (samples)
# on Mac OS X you can check the Ruby conversions with the GUI app PunyCode, http://software.dibomedia.de/products/show/2

# Sample inputs. Each assignment overwrites the previous one, so only the
# LAST value of str is actually converted below; reorder or comment out
# lines to try a different sample.
str = "http://www.ﺱﺲﺷ.com/"
str = "www.сделат картинки.com"
str = "http://www.сделаткартинки.com/"
str = "http://tūdaliņ.lv/"
str = "http://www.zürich.com/"
str = "http://www.hören.at/"
str = "http://www.žlutý kůň.com/"
str = "www.färgbolaget.nu"
str = "www.brændendekærlighed.com"
str = "www.mäkitorppa.com"
str = "www.färjestadsbk.net"
str = "あーるいん.com"
str = "www.예비교사.com"
str = "www.ハンドボールサムズ.com"
str = "www.日本平.jp"
str = "www.räksmörgås.se"
str = "www.różyczka.pl/"
str = "理容ナカムラ.com"
str = "http://Bücher.ch/"
str = "tūdaliņ.lv"


# Convert only when str passes the UTF8REGEX check AND contains at least one
# multi-byte sequence (pure-ASCII input is left alone and nothing is printed).
if str =~ UTF8REGEX && str =~ UTF8_REGEX_MBYTE

   # Split str into four pieces by replacing the whole string with one capture:
   #   s1 - leading "http://www." or "http://" (empty if neither is present)
   #   s2 - a leading "www." that was NOT already consumed as part of s1
   #   s3 - everything between the prefix and the last ".xxx" segment
   #   s4 - the last ".xxx" segment, including an optional trailing "/"
   s1 = str.gsub(/^(http:\/\/www\.|http:\/\/|).*?\.[^\.\/]+\/?$/n, '\1')
   s2 = str.gsub(/^(?:http:\/\/www\.|http:\/\/|)(www\.|).*?\.[^\.\/]+\/?$/n, '\1')
   s3 = str.gsub(/^(?:http:\/\/www\.|http:\/\/|www\.|)(.*?)\.[^\.\/]+\/?$/n, '\1')
   s4 = str.gsub(/^(?:http:\/\/www\.|http:\/\/|www\.|).*?(\.[^\.\/]+\/?)$/n, '\1')

   # Default to an http:// scheme when the input had none.
   if s1.empty? then s1 = 'http://' end

   # Downcase and normalize (normalize_KC from the "unicode" gem) before
   # Punycode-encoding the middle part.
   # NOTE(review): s3 is encoded as ONE unit even if it contains dots or
   # spaces; presumably each domain label should be encoded separately --
   # confirm before using this beyond a proof of concept.
   s3 = Punycode.encode(Unicode::normalize_KC(Unicode::downcase(s3)))

   # Reassemble the URL; "xn--" is the prefix put in front of the encoded
   # part. Note that << appends in place, so s1 itself is modified.
   punycoded_url = s1 << s2 << "xn--" << s3 << s4

   puts punycoded_url

   # Hand the URL to /usr/bin/open via a shell command. The URL is
   # interpolated into the command line unescaped, which is only acceptable
   # because str is hard-coded above.
   %x{ /usr/bin/open "#{punycoded_url}" }

end