Feb 01, 2016

Using CoffeeScript to build small UNIX-friendly tools

During the last few weeks I've been working intensively with NodeJS on collecting and processing API data. In most cases API data is received as JSON but needs to be transformed to CSV so it can be processed in R.

Why CoffeeScript?

So why use NodeJS for this job?

And why CoffeeScript?

Pipes

The power of the UNIX piping approach becomes clear when processing large files without extensive computing power: data streams through small tools piece by piece instead of being loaded into memory all at once.

When working with large JSON and CSV files, this approach becomes very attractive.
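As a minimal sketch of the idea (the file name upcase.coffee and the sample input are made up for illustration), a CoffeeScript program only needs to read from process.stdin and write to process.stdout to act as a filter in a pipe:

  # upcase.coffee -- a minimal stdin-to-stdout filter
  process.stdin.setEncoding('utf8')
  # chunks arrive one at a time, so even huge inputs never sit in memory at once
  process.stdin.on 'data', (chunk) ->
    process.stdout.write(chunk.toUpperCase())

It can then be dropped into a pipe like any other UNIX tool:

  $ cat large.txt | coffee upcase.coffee > large_upper.txt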

Example: JSON to CSV

To get the idea:

  $ cat large.json | coffee json2csv.coffee > large.csv
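To make this concrete, suppose large.json contains the following (hypothetical) records:

  [{"id": 1, "name": "a"}, {"id": 2, "name": "b"}]

The resulting large.csv would then look roughly like this (exact quoting depends on the json2csv version):

  "id","name"
  1,"a"
  2,"b"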

Conclusion

CoffeeScript neither replaces well-established scripting languages like Ruby or Python, nor would other scripting languages be incapable of doing this job. But the idea of building small tools that receive and process data at a low level is a good reason to give CoffeeScript a try, as you will see in the following example.

#!/usr/bin/env coffee

json2csv = require('json2csv')
JSONStream = require('JSONStream')
expandHomeDir = require('expand-home-dir')
_ = require('lodash')
# { camelize, underscored } = require('underscore.string')
fs = require('fs')

options = require('yargs')
  .help('h')
  .alias('h', 'help')
  .describe('header', 'display csv header [true,false,only]')
  .default('header', 'true')
  .describe('fields', 'comma separated fields (e.g. `id,name,owner.login` or `*` for all)')
  .default('fields', '*')
  .describe('file', 'path to json file')
  .default('file', '')
  .describe('flatten', 'flatten JSON, so user.login.name is possible as column')
  .default('flatten', 'false')
  .describe('delimiter', 'csv delimiter')
  .default('delimiter', ',')
  .describe('quote', 'csv quote')
  .default('quote', '"')
  .alias('quote', 'terminator')
  # .demand(['fields'])
  .describe('jsonPath', 'path of parsing (e.g. `items`)')
  .epilogue("""
    converts JSON data to CSV
    Data can be given via a file or a pipe
      1) cat data.json | coffee json2csv.coffee > data.csv
         or
      2) coffee json2csv.coffee --file=data.json > data.csv
  """)
  .argv

{ delimiter, fields, file, quote, jsonPath, header, flatten } = options

# yargs delivers the flag as a string, so convert it to a boolean
flatten = flatten isnt 'false'

# fields = if fields is '*' then [] else options.fields.split(delimiter)

if file.trim()
  file = expandHomeDir(file)
  # lstatSync throws on missing files, so use existsSync for the check
  if not fs.existsSync(file)
    console.error("File '#{file}' doesn't exist / isn't readable")
    process.exit(1)
  else
    stream = fs.createReadStream(file)
else
  # stream via stdin
  stream = process.stdin

# parse only the part of the document at jsonPath if given, otherwise the whole document
parser = if jsonPath then JSONStream.parse(jsonPath.split(',')) else JSONStream.parse()

# if --header=false was given, skip header processing entirely
headerIsProcessed = header is 'false'

processHeader = (data, header, json2csvOptions) ->
  if header is 'only'
    json2csvOptions.data = data[0] # we only need the first row, maybe faster?!
  else
    json2csvOptions.data = data
  json2csvOptions.hasCSVColumnTitle = true

  if fields isnt '*' and fields.trim().length > 0
    # specific fields (the flag is documented as comma separated, so split on ',')
    json2csvOptions.fields = fields.split(',')

  json2csv json2csvOptions, (err, csv) ->
    if err
      console.error(err)
      process.exit(1)
    if header is 'only'
      console.log(csv.split('\n')[0])
      process.exit(0)
    else
      console.log(csv)


parser
  .on 'data', (data) ->
    # if data is not an array, wrap it in an array of length 1
    if data? and data.constructor isnt Array
      data = [data]

    # pass the CLI delimiter and quote through to the converter
    # (assumption: `del` and `quotes` are the json2csv option names)
    json2csvOptions = {flatten, del: delimiter, quotes: quote}

    json2csvOptions.fields = fields.split(',') if fields isnt '*'

    unless headerIsProcessed
      headerIsProcessed = true
      processHeader(data, header, json2csvOptions)
    else
      # print out rows
      json2csvOptions.data = data
      json2csvOptions.hasCSVColumnTitle = false
      json2csv json2csvOptions, (err, csv) ->
        return console.error(err) if err
        console.log csv

# wire it up: pipe the input stream into the JSON parser
stream
  .pipe(parser)
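
A few example invocations, following the script's own --help text (assuming it is saved as json2csv.coffee; the data files are placeholders):

  # print only the header line
  $ cat data.json | coffee json2csv.coffee --header=only

  # flatten nested objects and pick specific columns
  $ cat data.json | coffee json2csv.coffee --flatten=true --fields=id,owner.login

  # only parse the `items` property of the document
  $ cat response.json | coffee json2csv.coffee --jsonPath=items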