Preparing the input
I got from
Sanmi Koyejo, a graduate student at the University of Texas at Austin, a Python script for converting the KDD Yahoo! Cup dataset into GraphLab format. Thanks so much!
It may be preferable to the Matlab script, since for some users the Matlab script runs out of memory.
I have attached the python script for reading the kdd dataset. The package requires numpy so it can use the file writing method.
Additional information about the conversion procedure is kindly supplied by Yoyo
here.
NOTE: To run the resulting files in GraphLab, you will need access to a 64-bit machine. (A 32-bit machine cannot load this dataset in its current form.)
'''
Created on Apr 16, 2011
Read KDD cup data (Low Memory) to store in format suitable for graphlab pmf
Requires numpy, uses ndarray.tofile() to write the binary file
Module uses (A LOT) less memory than readKddData.py by reading and writing one user at a time
The tradeoff is that the max user id, item id, days and number of ratings must be known beforehand
This is because pmf expects this input in the first line of the File
Known Issue: Number of test (-f3) items is 624959, although we hard-code 624961.
This restriction comes from a bug(?) in pmf (to be fixed soon)
Ignore this warning if this is your only issue
usage: python readKddLM.py --help
python readKddLM.py -i trainIdx.txt -o kddcup -f 1
python readKddLM.py -i validationIdx.txt -o kddcupe -f 2
python readKddLM.py -i testIdx.txt -o kddcupt -f 3
@author: Sanmi Koyejo; sanmi.k@mail.utexas.edu
'''
from optparse import OptionParser
from numpy import array, dtype, amax, maximum, zeros, int_
def readLine(fileHandle, splitter='|'):
    '''Fetch the next line of fileHandle and cut it into fields.

    Trailing whitespace (including the newline) is stripped before the line
    is split on splitter. Once the file is exhausted, the empty value that
    readline() produced is handed back unchanged so the caller can test it
    '''
    rawLine = fileHandle.readline()
    if rawLine:
        # drop the newline first, then break the text into its fields
        stripped = rawLine.rstrip()
        return stripped.split(splitter)
    # EOF: nothing to split, the caller decides what to do
    return rawLine
def readChunk(fileHandle, chunkSize, splitter='\t'):
    '''Generator yielding up to chunkSize lines of fileHandle as field lists.

    Each line has its trailing whitespace removed and is then split on
    splitter. The generator finishes early if the file runs out of lines
    '''
    for _ in range(chunkSize):
        rawLine = fileHandle.readline()
        if rawLine:
            yield rawLine.rstrip().split(splitter)
        else:
            # EOF reached before chunkSize lines were available
            return
def readOneUser(fileHandle, testFlag=False, verbose=True):
    '''Generator: read the input one user at a time and yield a rating matrix.

    Each user starts with a "userID|nRatings" header line, followed by
    nRatings tab-separated rating lines. For every user a float32 array with
    one row per rating is yielded; its columns are:
        0: user ID + 1
        1: item ID + 1
        2: day
        3: rating (always 1.0 when testFlag is set)

    fileHandle: open file positioned at a user header line
    testFlag:   True  -> rating lines have 3 fields (item, day, <ignored>)
                False -> rating lines have 4 fields (item, rating, day, <ignored>)
    verbose:    print a warning when fewer ratings than announced were read

    A user may come up short only when the file ends early (normally the last
    user); a shortfall anywhere else makes the next header line get parsed as
    a rating line, which trips the field-count assertion.
    Users for which no rating line was read are skipped: an empty matrix
    carries no data and cannot be reduced with amax() by the caller
    '''
    while True:
        header = readLine(fileHandle)
        if not header:
            break  # EOF
        assert len(header) == 2, "malformed user header: %r" % (header,)
        userID = float(header[0])
        nRatings = int(header[1])
        rateMat = zeros((nRatings, 4), dtype=dtype('f4'))
        rateMat[:, 0] = userID + 1  # user ID
        # Count rows explicitly: the loop variable of an empty for-loop is
        # either undefined or left over from the previous user.
        nRead = 0
        for fields in readChunk(fileHandle, nRatings):
            # check the field count before converting, so a misplaced header
            # line is reported by the assertion rather than by float()
            if testFlag:
                assert len(fields) == 3, "expected 3 fields, got %r" % (fields,)
                rateMat[nRead, 1] = float(fields[0]) + 1  # item ID
                rateMat[nRead, 2] = float(fields[1])      # day
                rateMat[nRead, 3] = 1.0                   # rating
            else:
                assert len(fields) == 4, "expected 4 fields, got %r" % (fields,)
                rateMat[nRead, 1] = float(fields[0]) + 1  # item ID
                rateMat[nRead, 2] = float(fields[2])      # day
                rateMat[nRead, 3] = float(fields[1])      # rating
            nRead += 1
        if nRead != nRatings:
            # User had fewer ratings than announced (file ended early).
            if verbose:
                print("Warning: Expected %s ratings from user; id: %s , read %s ratings."
                      % (nRatings, int(userID), nRead))
            # Always drop the unused rows, whether or not the warning is shown,
            # so that no all-zero ratings are written out.
            rateMat = rateMat[:nRead, :]
        if nRead == 0:
            continue  # nothing to emit for this user
        yield rateMat
def KddDataParser(infile, outfile, size, testFlag, verbose):
    '''Convert a KDD text file to the binary layout, one user at a time.

    The output file starts with the raw bytes of size, followed by every
    user's float32 rating matrix as produced by readOneUser(), with the item
    ID column shifted by size[0] (itemID = itemID + max user ID).

    infile:   path of the text input file
    outfile:  path of the binary output file (overwritten)
    size:     numpy array written verbatim as the file header; size[0] is
              used as the item ID offset
    testFlag: passed to readOneUser() (True for the test file format)
    verbose:  print progress messages

    Returns (readSize, readLen): readSize is an array holding the largest
    user ID, item ID (before the offset is applied) and day seen; readLen is
    the total number of ratings written
    '''
    # running maxima for user, item, day and the running rating count
    readLen = 0
    readSize = zeros(3, dtype=dtype('i4'))
    if verbose:
        print("opening input file %s" % infile)
    # Text mode, so the str separators used for splitting also work on
    # Python 3. The with-blocks close both files even if parsing fails.
    with open(infile, 'r') as readhandle:
        if verbose:
            print("opening output file %s" % outfile)
        with open(outfile, 'wb') as writehandle:
            # write the size information
            size.tofile(writehandle)
            userOffset = float(size[0])
            for count, rateMat in enumerate(readOneUser(readhandle, testFlag, verbose)):
                # max user, max item, max time -- taken before the item offset
                readSize = maximum(readSize, int_(amax(rateMat[:, :3], axis=0)))
                readLen += rateMat.shape[0]
                rateMat[:, 1] += userOffset  # itemID = itemID + maxUser
                rateMat.tofile(writehandle)
                if verbose and count % 50000 == 0:
                    print('read %s ratings from user %s'
                          % (rateMat.shape[0], int(rateMat[0, 0]) - 1))
    if verbose:
        print("data conversion completed")
    return readSize, readLen
def main(argv=None):
    '''Command line entry point: parse the options and run the conversion.

    argv: list of command line arguments (without the program name);
          None means sys.argv[1:] is used, as before

    The expected sizes (max user ID, max item ID, max day, number of ratings)
    default to the known KDD values for the chosen --filetype and can be
    overridden with -u/-m/-t/-r. After the conversion the values actually read
    are printed next to the expected ones, with a warning if they differ.

    Raises LookupError if --filetype is not 1, 2 or 3
    '''
    usage = "usage: %prog [options] arg"
    parser = OptionParser(usage)
    parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True)
    parser.add_option("-i", "--infile", dest="infile",
                      help="input file name", default="smallTrain.txt")  # fixme
    parser.add_option("-o", "--outfile", dest="outfile",
                      help="output file name", default="kddcupLM")
    parser.add_option("-f", "--filetype", dest="filetype", type='int',
                      help="training=1, validation=2, test=3", default=1)
    # type='int': without it the overrides stay strings and never compare
    # equal to the integer values read back from the data
    parser.add_option("-u", "--nuser", dest="nuser", type='int',
                      help="max ID of users, if not set, defaults to expected KDD size")
    parser.add_option("-m", "--nitem", dest="nitem", type='int',
                      help="max ID of items, if not set, defaults to expected KDD size")
    parser.add_option("-t", "--ntime", dest="ntime", type='int',
                      help="max number of days, if not set, defaults to expected KDD size")
    parser.add_option("-r", "--nrate", dest="nrate", type='int',
                      help="number of ratings, if not set, defaults to expected KDD size")
    options, _ = parser.parse_args(argv)

    # setup nUser/nitem/nTime defaults based on train/valid/test
    nuser = 1000990 if options.nuser is None else options.nuser
    nitem = 624961 if options.nitem is None else options.nitem
    # TODO: once pmf is modified, change definition of nitem
    #   nitem (train, valid) == 624961
    #   nitem (test)         == 624959
    # Should not affect results

    # filetype -> (default ntime, default nrate, is test file)
    defaults = {
        1: (6645, 252800275, False),
        2: (6645, 4003960, False),
        3: (6649, 6005940, True),
    }
    if options.filetype not in defaults:
        errorStr = "--filetype input: " + repr(options.filetype) + ". Allowed values are 1, 2, 3"
        raise LookupError(errorStr)
    defaultTime, defaultRate, istest = defaults[options.filetype]
    ntime = defaultTime if options.ntime is None else options.ntime
    nrate = defaultRate if options.nrate is None else options.nrate

    size = array([nuser, nitem, ntime, nrate], dtype=dtype('i4'))
    [nUser, nItem, nDays], nRate = KddDataParser(options.infile, options.outfile, size, istest, options.verbose)
    print('input nuser: %s , max ID of user read: %s' % (nuser, nUser))
    print('input nitem: %s , max ID of item read: %s' % (nitem, nItem))
    print('input ndays: %s , max day read %s' % (ntime, nDays))
    print('input nrate: %s , Number of ratings read: %s' % (nrate, nRate))
    if (nuser != nUser) or (nitem != nItem) or (ntime != nDays) or (nrate != nRate):
        print("Warning: input parameters differ from output parameters, "
              "graphlab pmf may not run correctly !!!")


# run the converter only when executed as a script
if __name__ == '__main__':
    main()
Sanity check 0: The downloaded file size from Yahoo! should be:
-rw-r--r-- 1 bickson users 134407201 2011-01-24 07:46 testIdx1.txt
-rw-r--r-- 1 bickson users 5967164350 2011-01-24 10:23 trainIdx1.txt
-rw-r--r-- 1 bickson users 104193447 2011-01-24 10:25 validationIdx1.txt
Sanity check 1: The output file size should be:
$ ls -l kddcup*
-rw-r--r-- 1 sil sil 4044804416 2011-06-27 18:18 kddcup
-rw-r--r-- 1 sil sil 64063376 2011-06-28 12:51 kddcupe
-rw-r--r-- 1 sil sil 96095056 2011-06-28 16:22 kddcupt
(Thanks Yoyo!)
Sanity check 2: you can use the md5sum command to verify creation of inputs.
You should get the following numbers:
<34|0>bickson@bigbro6:~/newgraphlab/graphlabapi/debug/demoapps/pmf$ md5sum kddcupe
aa76bb1d0e6e897e270ed65d021ed1d8 kddcupe
<35|0>bickson@bigbro6:~/newgraphlab/graphlabapi/debug/demoapps/pmf$ md5sum kddcupt
917599ce7f715890a2705dc04851ac12 kddcupt
<36|0>bickson@bigbro6:~/newgraphlab/graphlabapi/debug/demoapps/pmf$ md5sum kddcup
345b168a208757b3098c6674b2fb653a kddcup
If you got different output, please check carefully that the command line arguments used are as instructed.
Sanity check 3: When running the third script, you should see the output:
> data conversion completed
> input nuser: 1000990 , max ID of user read: 1000990
> input nitem: 624961 , max ID of item read: 624959
> input ndays: 6649 , max day read 6649
> input nrate: 6005940 , Number of ratings read: 6005940
> Warning: input parameters differ from output parameters, graphlab pmf may
> not run correctly !!!