DZone Snippets is a public source code repository. Easily build up your personal collection of code snippets, categorize them with tags / keywords, and share them with the world

Snippets has posted 5883 posts at DZone. View Full User Profile

Processing Large CSV Files

06.10.2008
| 6211 views |
  • submit to reddit
        
  # this method allows fast CSV processing, it reads file_name, generates a new
  # CSV file and inserts all data in a mysql table using LOAD DATA INFILE

    # you can use it like this:

    load_csv_data(PRELOAD_DIR+'socios.csv', 'users') do |csv, thing_id, row|
      csv << [thing_id,
        row['NOMBRE'].to_s.to_permalink+'.'+row['APELLIDOS'].to_s.to_permalink, #  login
        row['E_MAIL'], #  email
        row['SEXO'],  #  gender
        (row['NO_DATOS'] == 'S' || row['NO_DATOS'] == 'VERDADERO' ? 1 : 0 ), #  no_datos
      ]
    end


  def load_csv_data(file_name, table_name)
    STDERR.print("\nParsing '#{file_name}' to insert data in '#{table_name}'\n")

    n_lines = (%x[wc -l #{file_name}]).split(' ')[0].to_f
    time_start = Time.now
    begin
      require 'fastercsv'
      count = 0.0

      csv_string = FasterCSV.open("data.csv", "w", { :col_sep => ";", :force_quotes => true })  do |csv|
      FasterCSV.open("#{file_name}", { :col_sep => "\t", :headers => :first_row }).each do |row|
        thing_id = row[0].to_i
        if thing_id && thing_id > 0
          
          yield csv, thing_id, row

          percentage = ((count += 1) * 100.0)/n_lines
          STDERR.print "%.0f%%..." % percentage if (percentage.modulo(2.0) < 0.001)
        end
        end
      end
    rescue EOFError
      f.close
    end
    total_lines = (%x[wc -l data.csv]).split(' ')[0].to_i
    STDERR.print("\nInserting #{total_lines} lines in table '#{table_name}'\n")
    ActiveRecord::Base.connection.execute("LOAD DATA INFILE '#{File.join(File.dirname(__FILE__), '..', '..', 'data.csv')}' IGNORE INTO TABLE #{table_name} FIELDS TERMINATED BY ';' ENCLOSED BY '\"';")
    warn("Finished importing '#{file_name}'.\nLines: #{n_lines.to_i.to_s} (#{(n_lines.to_i - total_lines.to_i).to_s} lost)\nTime: %.2f minutes\n" % ((Time.now - time_start)/60))
  end