1
+ #!/usr/bin/env python
2
+
3
+ #
4
+ # Sort large text files in a minimum amount of memory
5
+ #
6
+ import os
7
+ import sys
8
+ import argparse
9
+
10
class FileSplitter(object):
    """Split a text file into individually sorted block files.

    Each block is written to the current working directory using
    ``BLOCK_FILENAME_FORMAT``; the names of all blocks written so far are
    kept in ``block_filenames`` in creation order.
    """

    BLOCK_FILENAME_FORMAT = 'block_{0}.dat'

    def __init__(self, filename):
        """Remember the file to split; no I/O happens here.

        :param filename: path of the text file that ``split`` will read.
        """
        self.filename = filename
        self.block_filenames = []

    def write_block(self, data, block_number):
        """Write ``data`` to the block file numbered ``block_number``.

        :param data: text to store in the block file.
        :param block_number: value substituted into BLOCK_FILENAME_FORMAT.
        """
        filename = self.BLOCK_FILENAME_FORMAT.format(block_number)
        # The context manager guarantees the block is flushed and closed even
        # if the write fails part-way.
        with open(filename, 'w') as block_file:
            block_file.write(data)
        self.block_filenames.append(filename)

    def get_block_filenames(self):
        """Return the list of block file names written so far."""
        return self.block_filenames

    def split(self, block_size, sort_key=None):
        """Read the input in chunks, sort each chunk and write it as a block.

        :param block_size: size hint passed to ``readlines``; whole lines are
            read until roughly this much data has been consumed.
        :param sort_key: optional key function used when sorting the lines
            of each block; ``None`` sorts the raw lines.
        """
        block_number = 0

        with open(self.filename, 'r') as source:
            while True:
                lines = source.readlines(block_size)

                if not lines:
                    break

                # Only the very last line of the input can lack a newline.
                # Terminate it so that it is not glued to its neighbour once
                # sorting moves it away from the end of the block.
                if not lines[-1].endswith('\n'):
                    lines[-1] += '\n'

                # key=None is equivalent to a plain sort().
                lines.sort(key=sort_key)

                self.write_block(''.join(lines), block_number)
                block_number += 1

    def cleanup(self):
        """Delete every block file recorded in ``block_filenames``.

        An explicit loop is used because ``map`` is lazy on Python 3 and
        would never call ``os.remove``.
        """
        for block_filename in self.block_filenames:
            os.remove(block_filename)
47
+
48
+
49
class NWayMerge(object):
    """Merge strategy that picks the smallest of the candidate lines."""

    def select(self, choices):
        """Return the index (key) of the smallest value in ``choices``.

        :param choices: mapping of index -> value (keys may be sparse), or a
            plain sequence of values, in which case positions are the indexes.
        :returns: the key/position of the minimum value; the first one
            encountered wins on ties. ``-1`` when ``choices`` is empty.
        """
        # Iterate over the actual keys instead of range(len(...)): a mapping
        # whose keys are not contiguous would otherwise raise KeyError.
        if hasattr(choices, 'items'):
            candidates = choices.items()
        else:
            candidates = enumerate(choices)

        min_index = -1
        min_str = None

        for index, value in candidates:
            if min_str is None or value < min_str:
                min_index = index
                # Remember the current minimum so later candidates are
                # compared against it rather than against None.
                min_str = value

        return min_index
59
+
60
+
61
class FilesArray(object):
    """One-line look-ahead buffer over a collection of open files.

    ``files`` is indexed by the integers ``0 .. len(files) - 1``. Each index
    has a buffer slot holding the next unread line (``None`` when the slot
    needs refilling); indexes whose file has been read to the end are
    recorded in ``empty``.
    """

    def __init__(self, files):
        """Store the file handles and create one empty slot per file."""
        self.files = files
        self.empty = set()
        self.num_buffers = len(files)
        # Every slot starts out as None, i.e. "needs a line".
        self.buffers = dict.fromkeys(range(self.num_buffers))

    def get_dict(self):
        """Return ``{index: buffered line}`` for every non-exhausted file."""
        return {
            slot: line
            for slot, line in self.buffers.items()
            if slot not in self.empty
        }

    def refresh(self):
        """Refill every vacant slot with the next line of its file.

        :returns: ``False`` once all files are exhausted, ``True`` otherwise.
        """
        for slot in range(self.num_buffers):
            # Skip slots that still hold a line or whose file is finished.
            if slot in self.empty or self.buffers[slot] is not None:
                continue

            line = self.files[slot].readline()
            self.buffers[slot] = line

            # readline() returns '' at end of file.
            if line == '':
                self.empty.add(slot)

        return len(self.empty) != self.num_buffers

    def unshift(self, index):
        """Take the buffered line at ``index``, leaving the slot vacant."""
        taken = self.buffers[index]
        self.buffers[index] = None
        return taken
89
+
90
+
91
class FileMerger(object):
    """Merge several sorted files into one using a pluggable strategy."""

    def __init__(self, merge_strategy):
        """Store the strategy.

        :param merge_strategy: object whose ``select(choices)`` returns the
            index of the line that should be written next.
        """
        self.merge_strategy = merge_strategy

    def merge(self, filenames, outfilename, buffer_size):
        """Merge the files named in ``filenames`` into ``outfilename``.

        :param filenames: sequence of input file paths.
        :param outfilename: path of the merged output file (overwritten).
        :param buffer_size: buffering argument passed to every ``open`` call.
        """
        handles = {}
        outfile = open(outfilename, 'w', buffer_size)

        try:
            handles = self.get_file_handles(filenames, buffer_size)
            buffers = FilesArray(handles)

            while buffers.refresh():
                min_index = self.merge_strategy.select(buffers.get_dict())
                outfile.write(buffers.unshift(min_index))
        finally:
            # Always release the inputs and flush/close the output, even when
            # the merge fails part-way.
            for handle in handles.values():
                handle.close()
            outfile.close()

    def get_file_handles(self, filenames, buffer_size):
        """Open every file for reading.

        :param filenames: sequence of file paths.
        :param buffer_size: buffering argument passed to ``open``.
        :returns: dict mapping the position of each name in ``filenames`` to
            its open file object.
        """
        files = {}

        try:
            for index, name in enumerate(filenames):
                files[index] = open(name, 'r', buffer_size)
        except Exception:
            # Do not leak the handles opened before the failure.
            for handle in files.values():
                handle.close()
            raise

        return files
110
+
111
+
112
+
113
class ExternalSort(object):
    """Sort a text file by splitting it into sorted blocks and merging them."""

    def __init__(self, block_size):
        """Store the block size.

        :param block_size: size used both as the split hint and as the basis
            for the per-file merge buffer size.
        """
        self.block_size = block_size

    def sort(self, filename, sort_key=None):
        """Sort ``filename`` and write the result to ``filename + '.out'``.

        :param filename: path of the text file to sort.
        :param sort_key: optional key function forwarded to the splitter.

        NOTE(review): the merge strategy is constructed without ``sort_key``;
        if it compares raw lines, a custom key only orders lines inside each
        block -- confirm before relying on ``sort_key``.
        """
        num_blocks = self.get_number_blocks(filename, self.block_size)
        splitter = FileSplitter(filename)

        try:
            splitter.split(self.block_size, sort_key)

            merger = FileMerger(NWayMerge())
            # Floor division keeps the buffer size an int on Python 3, and the
            # lower bound of 1 avoids requesting unbuffered text I/O (0),
            # which Python 3 rejects for text-mode files.
            buffer_size = max(1, self.block_size // (num_blocks + 1))
            merger.merge(splitter.get_block_filenames(),
                         filename + '.out',
                         buffer_size)
        finally:
            # Remove the temporary blocks even if splitting or merging fails.
            splitter.cleanup()

    def get_number_blocks(self, filename, block_size):
        """Estimate how many blocks of ``block_size`` the file needs.

        :returns: ``file size // block_size + 1`` as an integer.
        """
        return (os.stat(filename).st_size // block_size) + 1
130
+
131
+
132
def parse_memory(string):
    """Convert a size such as ``'512'``, ``'64k'``, ``'100M'`` or ``'2g'``
    into a number of bytes.

    The last character selects a binary multiplier (case-insensitive
    ``k``, ``m`` or ``g``); without a recognised suffix the whole string is
    parsed as an integer.
    """
    multipliers = {
        'k': 1024,
        'm': 1024 * 1024,
        'g': 1024 * 1024 * 1024,
    }

    scale = multipliers.get(string[-1].lower())

    if scale is None:
        # No unit suffix: the value is already expressed in bytes.
        return int(string)

    return int(string[:-1]) * scale
141
+
142
+
143
+
144
def main():
    """Command-line entry point: parse the arguments and sort the file."""
    cli = argparse.ArgumentParser()
    cli.add_argument('-m',
                     '--mem',
                     default='100M',
                     help='amount of memory to use for sorting')
    cli.add_argument('filename',
                     nargs=1,
                     metavar='<filename>',
                     help='name of file to sort')
    options = cli.parse_args()

    memory_limit = parse_memory(options.mem)
    # nargs=1 makes argparse deliver the positional as a one-element list.
    target = options.filename[0]

    ExternalSort(memory_limit).sort(target)
158
+
159
+
160
# Run the command-line interface only when executed as a script, so that
# importing this module has no side effects.
if __name__ == '__main__':
    main()
0 commit comments