#!/bin/csh
#
# Builds a first-cut ARFF file (bact94-ver-1.arff) from bact94.csv.
#
# Inputs : /public/subjects/cs432/data/other/pakdd2000/bact94.csv,
#          new-head.txt and extract-full-line.awk in the current directory.
# Outputs: junk.1 and junk.2 (intermediate files, left in place) and
#          bact94-ver-1.arff.
#
# Simplifications made:
#   1. All records which don't have 161 attributes are not used.
#   2. All lines containing " are not used.
#   3. >x is replaced by x (the sed step below only covers a comma followed
#      by >32, >16, >8 or >4).
#   4. "%" has been replaced by "pcnt".
#
# Notes:
#   1. It appears that the Solaris implementation of "cut" silently fails on
#      lines longer than 1024 characters. The first line of bact94.csv
#      is about 1060 characters.
#   2. arfftocsv dies on attribute values containing the character "%".

# Filter pipeline, stage by stage:
#   tail   - skip the first line of the CSV (presumably the header row, which
#            new-head.txt replaces below -- TODO confirm).
#            NOTE(review): "tail +2" is the historical spelling; POSIX spells
#            it "tail -n +2". Left as is because this script targets Solaris.
#   tr     - delete carriage returns (octal 015). The set is written without
#            surrounding brackets: "[\015]" would also delete every literal
#            '[' and ']' from the data under a POSIX tr.
#   fgrep  - drop every line containing a double quote (simplification 2).
#   sed    - apply simplifications 4 and 3, in that order, in one process.
#   awk    - run extract-full-line.awk (presumably implements simplification 1;
#            the awk script is not shown here).
tail +2 /public/subjects/cs432/data/other/pakdd2000/bact94.csv |\
    tr -d '\015' |\
    fgrep -v '"' |\
    sed -e 's/%/pcnt/g' \
        -e 's/,>32/,32/g' \
        -e 's/,>16/,16/g' \
        -e 's/,>8/,8/g' \
        -e 's/,>4/,4/g' |\
    awk -f extract-full-line.awk > junk.1

# Put the replacement header in front of the filtered records, then convert.
cat new-head.txt junk.1 > junk.2
csvtoarff junk.2 > bact94-ver-1.arff

exit

# Some useful commands (kept for reference; never reached because of the
# exit above).
cut -d"," -f67 bact94.csv | egrep -e '(<|>)'