#! /usr/bin/ruby -KU

#if $KCODE != 'UTF8'
#  raise "$KCODE must be UTF8"
#end

# Reads four Unicode data files named on the command line and prints, on
# standard output, a C source fragment: HEAD, one "#define" per First/Last
# range marker found in the first file, HEAD1, one initializer row per code
# point, a terminating sentinel row, and TAIL.
#
# NOTE(review): this file reached review with a span of text missing. The
# heredoc body of HEAD (only "HEAD=<" survived), the definitions of HEAD1 and
# TAIL, and the opening lines of hex2str were lost. HEAD, HEAD1 and TAIL are
# printed at the bottom of this script but are NOT defined here; restore their
# definitions from the upstream copy before running. Their contents cannot be
# inferred from the surviving code and have deliberately not been invented.

# Converts a decomposition field into a [canonical, compatibility] pair of
# UTF-8 strings.
#
# Returns [nil, compat] when the field starts with a "<tag>" token (the tag is
# dropped and the remaining hex code points form the compatibility string).
# Raises RuntimeError ("unknown value: ...") for any other unrecognized field.
#
# NOTE(review): only the "<tag>" branch, the raise, and the return value were
# present in the surviving text. The empty-field guard and the plain-hex
# branch below are reconstructed from the call site (which tests canon for
# nil) -- in particular "compat = canon" is an assumption; confirm against the
# upstream copy.
def hex2str(hex)
  return [nil, nil] if hex.nil? || hex == ''

  chars = hex.split(' ')
  if chars[0] =~ /\A[0-9A-F]+\z/
    canon = chars.map { |c| c.hex }.pack('U*')
    compat = canon
  elsif chars[0] =~ /\A<.+>\z/
    chars.shift
    compat = chars.map { |c| c.hex }.pack('U*')
    canon = nil
  else
    raise "unknown value: " + hex
  end
  [canon, compat]
end

# Converts a space-separated list of hex code points into a UTF-8 string.
# Returns nil when str is nil or empty.
def hex_or_nil(str)
  return nil if str.nil? || str == ''

  str.split(' ').map { |c| c.hex }.pack('U*')
end

# Renders str as a double-quoted C string literal. Bytes 32..126 other than
# the double quote (34) and backslash (92) are copied as-is; every other byte
# is written as a three-digit octal escape. Returns the bare token NULL when
# str is nil/false.
def printstr(str)
  return "NULL" if !str

  escaped = ''
  str.each_byte do |c|
    if c >= 32 && c < 127 && c != 34 && c != 92
      escaped << c.chr
    else
      escaped << format("\\%03o", c)
    end
  end
  '"' + escaped + '"'
end

# Collects the code points on lines mentioning Full_Composition_Exclusion.
# The first whitespace-delimited token of such a line is either one hex code
# point or a "XXXX..YYYY" range. Returns a Hash of code point => true.
def load_composition_exclusions(path)
  exclusion = {}
  # File.foreach rather than Kernel#open: the latter would run a command if
  # the argument started with "|", and foreach also closes the file for us.
  File.foreach(path) do |line|
    next if line =~ /^\#/ || line =~ /^$/
    next if line !~ /Full_Composition_Exclusion/

    code = line.split(/\s/).first
    if code =~ /^[0-9A-F]+$/
      exclusion[code.hex] = true
    elsif code =~ /^([0-9A-F]+)\.\.([0-9A-F]+)$/
      ($1.hex..$2.hex).each { |cp| exclusion[cp] = true }
    end
  end
  exclusion
end

# Reads the special-casing file. Each data line is
# "code; lower; title; upper; [condition;] # comment". Lines carrying a
# condition are skipped. A mapping equal to the code itself is stored as nil.
# Returns a Hash of code point => [lower, title, upper] (strings or nil).
def load_special_casing(path)
  casing = {}
  File.foreach(path) do |line|
    # Drop the trailing comment; tolerate lines that have none.
    data = line.sub(/#.*/m, '').strip
    next if data.empty?

    code, lower, title, upper, cond = data.split(/;\s*/)
    next if cond && !cond.empty?

    lower = nil if code == lower
    title = nil if code == title
    upper = nil if code == upper
    casing[code.hex] = [hex_or_nil(lower), hex_or_nil(title), hex_or_nil(upper)]
  end
  casing
end

# Reads the main data file: 15 semicolon-separated fields per line, of which
# this script uses 0 (code), 1 (name), 2 (general category), 3 (combining
# class), 5 (decomposition) and 12..14 (upper, lower, title case mappings).
#
# Returns [udata, range_data]:
#   udata      - Hash of code point =>
#                [ccclass, canon, compat, upcase, lowcase, titlecase, gencat]
#   range_data - Array of [MACRO_NAME, code point] for names of the form
#                "<..., First>" / "<..., Last>"
def load_unicode_data(path)
  udata = {}
  range_data = []
  File.foreach(path) do |line|
    line = line.chomp
    next if line.empty?

    fields = line.split(';', 15)
    code = fields[0].hex
    charname = fields[1]
    canon, compat = hex2str(fields[5])
    udata[code] = [fields[3].to_i, canon, compat,
                   hex_or_nil(fields[12]), hex_or_nil(fields[13]),
                   hex_or_nil(fields[14]), fields[2]]

    if charname =~ /^<(.*, (First|Last))>$/
      # "Foo Bar, First" -> "FOO_BAR_FIRST" so it is usable as a C macro name.
      # (The pattern previously contained a stray line break and never matched.)
      range_data << [$1.upcase.gsub(/,? /, '_'), code]
    end
  end
  [udata, range_data]
end

# Reads the East Asian width file: "code;width # comment" or
# "first..last;width # comment". Whitespace around either field is ignored.
# Returns a Hash of code point => width string.
def load_east_asian_width(path)
  ea_width = {}
  File.foreach(path) do |line|
    data = line.sub(/#.*/m, '').strip
    next if data.empty?

    code, width = data.split(';').map { |field| field.strip }
    if code =~ /\.\./
      first, last = code.split('..')
      (first.hex..last.hex).each { |cp| ea_width[cp] = width }
    else
      ea_width[code.hex] = width
    end
  end
  ea_width
end

# Classifies a code point for the exclusion column of the output row:
#   1 - listed in the exclusion table
#   2 - canonical decomposition is a single code point (singleton)
#   3 - first code point of the canonical decomposition has a non-zero
#       combining class (non-starter decomposition)
#   0 - none of the above
def exclusion_type(code, canon, udata, exclusion)
  return 1 if exclusion[code]
  return 0 if canon.nil?

  cps = canon.unpack('U*')
  return 2 if cps.length == 1

  # A starter with no individual entry in udata is treated as class 0
  # instead of raising.
  starter = udata[cps[0]]
  starter && starter[0] != 0 ? 3 : 0
end

if ARGV.length != 4
  puts "Usage: #{$0} <UnicodeData.txt> <DerivedNormalizationProps.txt> " \
       "<SpecialCasing.txt> <EastAsianWidth.txt>"
  exit 0
end

## scan Composition Exclusions
exclusion = load_composition_exclusions(ARGV[1])

## scan Special Casing
casing = load_special_casing(ARGV[2])

## scan UnicodeData
udata, range_data = load_unicode_data(ARGV[0])

## scan EastAsianWidth
ea_width = load_east_asian_width(ARGV[3])

print HEAD
range_data.each do |charname, code|
  printf("#define %s\t(0x%04x)\n", charname, code)
end

print HEAD1
udata.keys.sort.each do |code|
  ccclass, canon, compat, upcase, lowcase, titlecase, gencat = udata[code]

  ## Exclusions
  ex = exclusion_type(code, canon, udata, exclusion)

  ## Special Casing: unconditional full mappings override the simple ones
  special = casing[code]
  if special
    lowcase = special[0] if special[0]
    titlecase = special[1] if special[1]
    upcase = special[2] if special[2]
  end

  # Code points absent from the width file default to 'N'.
  width = ea_width[code] || 'N'

  printf(" { 0x%04x, %s, %s, %s, %s, %s, %d, %d, c_%s, w_%s }, \n",
         code, printstr(canon), printstr(compat), printstr(upcase),
         printstr(lowcase), printstr(titlecase), ccclass, ex, gencat, width)
end
printf(" { -1, NULL, NULL, NULL, NULL, NULL, 0, 0, 0, 0 }\n")
print TAIL