From 27a98170c7ca00faf947545001ad07d6de63fdf2 Mon Sep 17 00:00:00 2001 From: ioane margiani Date: Fri, 12 Jun 2020 19:56:19 +0400 Subject: [PATCH 1/5] Added lempel-ziv compression algorithm implementation --- compression/lempel_ziv.py | 115 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 compression/lempel_ziv.py diff --git a/compression/lempel_ziv.py b/compression/lempel_ziv.py new file mode 100644 index 000000000000..0670c5b6c058 --- /dev/null +++ b/compression/lempel_ziv.py @@ -0,0 +1,115 @@ +""" + One of the several implementations of Lempel–Ziv–Welch compression algorithm + https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Welch +""" + +import os +import sys +import math + + +def read_file_binary(file): + """ + Reads given file as bytes and returns them as a long string + """ + result = "" + try: + with open(file, "rb") as binary_file: + data = binary_file.read() + for dat in data: + curr_byte = "{0:08b}".format(dat) + result += curr_byte + return result + except IOError: + print("File not accessible") + sys.exit() + + +def add_key_to_lexicon(lexicon, curr_string, index, last_match_id): + """ + Adds new strings (curr_string + "0", curr_string + "1") to the lexicon + """ + lexicon.pop(curr_string) + lexicon[curr_string + "0"] = last_match_id + + if math.log2(index).is_integer(): + for curr_key in lexicon.keys(): + lexicon[curr_key] = "0" + lexicon[curr_key] + + lexicon[curr_string + "1"] = bin(index)[2:] + + +def compress_data(data_bits): + """ + Compresses given data_bits using Lempel–Ziv–Welch compression algorithm + and returns the result as a string + """ + lexicon = {"0": "0", "1": "1"} + result, curr_string = "", "" + index = len(lexicon) + + for i in range(len(data_bits)): + curr_string += data_bits[i] + if curr_string not in lexicon.keys(): + continue + + last_match_id = lexicon[curr_string] + result += last_match_id + add_key_to_lexicon(lexicon, curr_string, index, last_match_id) + index += 1 + curr_string = "" + + while curr_string != "" and curr_string not in lexicon.keys(): + curr_string += "0" + + if curr_string != "": + last_match_id = lexicon[curr_string] + result += last_match_id + + return result + + +def add_file_length(source, compressed): + """ + Adds given file's length in front (using Elias gamma coding) of the compressed string + """ + file_length = os.path.getsize(source) + file_length_binary = bin(file_length)[2:] + length_length = len(file_length_binary) + + return "0" * (length_length - 1) + file_length_binary + compressed + + +def write_file_binary(file, to_write): + """ + Writes given to_write string (should only consist of 0's and 1's) as bytes in the file + """ + byte_length = 8 + try: + with open(file, "wb") as opened_file: + result_byte_array = [to_write[i:i + byte_length] for i in range(0, len(to_write), byte_length)] + + if len(result_byte_array[-1]) % byte_length == 0: + result_byte_array.append("10000000") + else: + result_byte_array[-1] += "1" + "0" * (byte_length - len(result_byte_array[-1]) - 1) + + for elem in result_byte_array: + opened_file.write(int(elem, 2).to_bytes(1, byteorder="big")) + except IOError: + print("File not accessible") + sys.exit() + + +def compress(source, destination): + """ + Reads source file, compresses it and writes the compressed result in destination file + """ + data_bits = read_file_binary(source) + compressed = compress_data(data_bits) + compressed = add_file_length(source, compressed) + write_file_binary(destination, compressed) + + +if __name__ == "__main__": + compress(sys.argv[1], sys.argv[2]) From 7a049b6d285f62328d1cdbb4e81fa17504a3b305 Mon Sep 17 00:00:00 2001 From: ioane margiani Date: Fri, 12 Jun 2020 20:00:02 +0400 Subject: [PATCH 2/5] Added lempel-ziv decompression algorithm implementation --- compression/lempel_ziv_decompress.py | 105 +++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 compression/lempel_ziv_decompress.py diff --git a/compression/lempel_ziv_decompress.py b/compression/lempel_ziv_decompress.py new file mode 100644 index 000000000000..b561b4880804 --- /dev/null +++ b/compression/lempel_ziv_decompress.py @@ -0,0 +1,105 @@ +""" + One of the several implementations of Lempel–Ziv–Welch decompression algorithm + https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Welch +""" + +import sys +import math + + +def read_file_binary(file): + """ + Reads given file as bytes and returns them as a long string + """ + result = "" + try: + with open(file, "rb") as binary_file: + data = binary_file.read() + for dat in data: + curr_byte = "{0:08b}".format(dat) + result += curr_byte + return result + except IOError: + print("File not accessible") + sys.exit() + + +def decompress_data(data_bits): + """ + Decompresses given data_bits using Lempel–Ziv–Welch compression algorithm + and returns the result as a string + """ + lexicon = {"0": "0", "1": "1"} + result, curr_string = "", "" + index = len(lexicon) + + for i in range(len(data_bits)): + curr_string += data_bits[i] + if curr_string not in lexicon.keys(): + continue + + last_match_id = lexicon[curr_string] + result += last_match_id + lexicon[curr_string] = last_match_id + "0" + + if math.log2(index).is_integer(): + newLex = {} + for curr_key in list(lexicon.keys()): + newLex["0" + curr_key] = lexicon.pop(curr_key) + lexicon = newLex + + lexicon[bin(index)[2:]] = last_match_id + "1" + index += 1 + curr_string = "" + return result + + +def write_file_binary(file, to_write): + """ + Writes given to_write string (should only consist of 0's and 1's) as bytes in the file + """ + byte_length = 8 + try: + with open(file, "wb") as opened_file: + result_byte_array = [to_write[i:i + byte_length] for i in range(0, len(to_write), byte_length)] + + if len(result_byte_array[-1]) % byte_length == 0: + result_byte_array.append("10000000") + else: + result_byte_array[-1] += "1" + "0" * (byte_length - len(result_byte_array[-1]) - 1) + + for elem in result_byte_array[:-1]: + opened_file.write(int(elem, 2).to_bytes(1, byteorder="big")) + except IOError: + print("File not accessible") + sys.exit() + + +def remove_prefix(data_bits): + """ + Removes size prefix, that compressed file should have + Returns the result + """ + counter = 0 + for letter in data_bits: + if letter == "1": + break + counter += 1 + + data_bits = data_bits[counter:] + data_bits = data_bits[counter + 1:] + return data_bits + + +def compress(source, destination): + """ + Reads source file, decompresses it and writes the result in destination file + """ + data_bits = read_file_binary(source) + data_bits = remove_prefix(data_bits) + decompressed = decompress_data(data_bits) + write_file_binary(destination, decompressed) + + +if __name__ == "__main__": + compress(sys.argv[1], sys.argv[2]) From 292364e816ab345cdba1bfb82350dca82f0a7cb1 Mon Sep 17 00:00:00 2001 From: ioane margiani Date: Fri, 12 Jun 2020 20:19:09 +0400 Subject: [PATCH 3/5] Reformatted lempel-ziv compress/decompress files using black --- compression/lempel_ziv.py | 9 +++++++-- compression/lempel_ziv_decompress.py | 11 ++++++++--- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/compression/lempel_ziv.py b/compression/lempel_ziv.py index 0670c5b6c058..27f22f7e2a69 100644 --- a/compression/lempel_ziv.py +++ b/compression/lempel_ziv.py @@ -87,12 +87,17 @@ def write_file_binary(file, to_write): byte_length = 8 try: with open(file, "wb") as opened_file: - result_byte_array = [to_write[i:i + byte_length] for i in range(0, len(to_write), byte_length)] + result_byte_array = [ + to_write[i : i + byte_length] + for i in range(0, len(to_write), byte_length) + ] if len(result_byte_array[-1]) % byte_length == 0: result_byte_array.append("10000000") else: - result_byte_array[-1] += "1" + "0" * (byte_length - len(result_byte_array[-1]) - 1) + result_byte_array[-1] += "1" + "0" * ( + byte_length - len(result_byte_array[-1]) - 1 + ) for elem in result_byte_array: opened_file.write(int(elem, 2).to_bytes(1, byteorder="big")) diff --git a/compression/lempel_ziv_decompress.py b/compression/lempel_ziv_decompress.py index b561b4880804..8118078320b2 100644 --- a/compression/lempel_ziv_decompress.py +++ b/compression/lempel_ziv_decompress.py @@ -61,12 +61,17 @@ def write_file_binary(file, to_write): byte_length = 8 try: with open(file, "wb") as opened_file: - result_byte_array = [to_write[i:i + byte_length] for i in range(0, len(to_write), byte_length)] + result_byte_array = [ + to_write[i : i + byte_length] + for i in range(0, len(to_write), byte_length) + ] if len(result_byte_array[-1]) % byte_length == 0: result_byte_array.append("10000000") else: - result_byte_array[-1] += "1" + "0" * (byte_length - len(result_byte_array[-1]) - 1) + result_byte_array[-1] += "1" + "0" * ( + byte_length - len(result_byte_array[-1]) - 1 + ) for elem in result_byte_array[:-1]: opened_file.write(int(elem, 2).to_bytes(1, byteorder="big")) @@ -87,7 +92,7 @@ def remove_prefix(data_bits): counter += 1 data_bits = data_bits[counter:] - data_bits = data_bits[counter + 1:] + data_bits = data_bits[counter + 1 :] return data_bits From 9d44acc682bbda82ea1192cc707b58afd544019c Mon Sep 17 00:00:00 2001 From: ioane margiani Date: Wed, 17 Jun 2020 22:24:42 +0400 Subject: [PATCH 4/5] Added type hints and some other modifications (Doctests coming up) --- compression/lempel_ziv.py | 40 +++++++++++++++------------- compression/lempel_ziv_decompress.py | 30 ++++++++++----------- 2 files changed, 36 insertions(+), 34 deletions(-) diff --git a/compression/lempel_ziv.py b/compression/lempel_ziv.py index 27f22f7e2a69..34fd0b2cd672 100644 --- a/compression/lempel_ziv.py +++ b/compression/lempel_ziv.py @@ -3,29 +3,31 @@ https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Welch """ +import math import os import sys -import math -def read_file_binary(file): +def read_file_binary(file_path: str) -> str: """ Reads given file as bytes and returns them as a long string """ result = "" try: - with open(file, "rb") as binary_file: + with open(file_path, "rb") as binary_file: data = binary_file.read() - for dat in data: - curr_byte = "{0:08b}".format(dat) - result += curr_byte + for dat in data: + curr_byte = "{0:08b}".format(dat) + result += curr_byte return result except IOError: print("File not accessible") sys.exit() -def add_key_to_lexicon(lexicon, curr_string, index, last_match_id): +def add_key_to_lexicon( + lexicon: dict, curr_string: str, index: int, last_match_id: int +) -> None: """ Adds new strings (curr_string + "0", curr_string + "1") to the lexicon """ @@ -33,13 +35,13 @@ def add_key_to_lexicon(lexicon, curr_string, index, last_match_id): lexicon[curr_string + "0"] = last_match_id if math.log2(index).is_integer(): - for curr_key in lexicon.keys(): + for curr_key in lexicon: lexicon[curr_key] = "0" + lexicon[curr_key] lexicon[curr_string + "1"] = bin(index)[2:] -def compress_data(data_bits): +def compress_data(data_bits: str) -> str: """ Compresses given data_bits using Lempel–Ziv–Welch compression algorithm and returns the result as a string @@ -50,7 +52,7 @@ def compress_data(data_bits): for i in range(len(data_bits)): curr_string += data_bits[i] - if curr_string not in lexicon.keys(): + if curr_string not in lexicon: continue last_match_id = lexicon[curr_string] @@ -59,7 +61,7 @@ def compress_data(data_bits): index += 1 curr_string = "" - while curr_string != "" and curr_string not in lexicon.keys(): + while curr_string != "" and curr_string not in lexicon: curr_string += "0" if curr_string != "": @@ -69,24 +71,24 @@ def compress_data(data_bits): return result -def add_file_length(source, compressed): +def add_file_length(source_path: str, compressed: str) -> str: """ Adds given file's length in front (using Elias gamma coding) of the compressed string """ - file_length = os.path.getsize(source) + file_length = os.path.getsize(source_path) file_length_binary = bin(file_length)[2:] length_length = len(file_length_binary) return "0" * (length_length - 1) + file_length_binary + compressed -def write_file_binary(file, to_write): +def write_file_binary(file_path: str, to_write: str) -> None: """ Writes given to_write string (should only consist of 0's and 1's) as bytes in the file """ byte_length = 8 try: - with open(file, "wb") as opened_file: + with open(file_path, "wb") as opened_file: result_byte_array = [ to_write[i : i + byte_length] for i in range(0, len(to_write), byte_length) @@ -106,14 +108,14 @@ def write_file_binary(file, to_write): sys.exit() -def compress(source, destination): +def compress(source_path, destination_path: str) -> None: """ Reads source file, compresses it and writes the compressed result in destination file """ - data_bits = read_file_binary(source) + data_bits = read_file_binary(source_path) compressed = compress_data(data_bits) - compressed = add_file_length(source, compressed) - write_file_binary(destination, compressed) + compressed = add_file_length(source_path, compressed) + write_file_binary(destination_path, compressed) if __name__ == "__main__": diff --git a/compression/lempel_ziv_decompress.py b/compression/lempel_ziv_decompress.py index 8118078320b2..4ba66c80556c 100644 --- a/compression/lempel_ziv_decompress.py +++ b/compression/lempel_ziv_decompress.py @@ -3,28 +3,28 @@ https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Welch """ -import sys import math +import sys -def read_file_binary(file): +def read_file_binary(file_path: str) -> str: """ Reads given file as bytes and returns them as a long string """ result = "" try: - with open(file, "rb") as binary_file: + with open(file_path, "rb") as binary_file: data = binary_file.read() - for dat in data: - curr_byte = "{0:08b}".format(dat) - result += curr_byte + for dat in data: + curr_byte = "{0:08b}".format(dat) + result += curr_byte return result except IOError: print("File not accessible") sys.exit() -def decompress_data(data_bits): +def decompress_data(data_bits: str) -> str: """ Decompresses given data_bits using Lempel–Ziv–Welch compression algorithm and returns the result as a string @@ -35,7 +35,7 @@ def decompress_data(data_bits): for i in range(len(data_bits)): curr_string += data_bits[i] - if curr_string not in lexicon.keys(): + if curr_string not in lexicon: continue last_match_id = lexicon[curr_string] @@ -44,7 +44,7 @@ def decompress_data(data_bits): if math.log2(index).is_integer(): newLex = {} - for curr_key in list(lexicon.keys()): + for curr_key in list(lexicon): newLex["0" + curr_key] = lexicon.pop(curr_key) lexicon = newLex @@ -54,13 +54,13 @@ def decompress_data(data_bits): return result -def write_file_binary(file, to_write): +def write_file_binary(file_path: str, to_write: str) -> None: """ Writes given to_write string (should only consist of 0's and 1's) as bytes in the file """ byte_length = 8 try: - with open(file, "wb") as opened_file: + with open(file_path, "wb") as opened_file: result_byte_array = [ to_write[i : i + byte_length] for i in range(0, len(to_write), byte_length) @@ -80,7 +80,7 @@ def write_file_binary(file, to_write): sys.exit() -def remove_prefix(data_bits): +def remove_prefix(data_bits: str) -> str: """ Removes size prefix, that compressed file should have Returns the result @@ -96,14 +96,14 @@ def remove_prefix(data_bits): return data_bits -def compress(source, destination): +def compress(source_path: str, destination_path: str) -> None: """ Reads source file, decompresses it and writes the result in destination file """ - data_bits = read_file_binary(source) + data_bits = read_file_binary(source_path) data_bits = remove_prefix(data_bits) decompressed = decompress_data(data_bits) - write_file_binary(destination, decompressed) + write_file_binary(destination_path, decompressed) if __name__ == "__main__": From 1b837ce7625d2f7d4e4abf2e3970031db63f567d Mon Sep 17 00:00:00 2001 From: ioane margiani Date: Wed, 17 Jun 2020 22:33:37 +0400 Subject: [PATCH 5/5] Shortened several lines to comply with the standards --- compression/lempel_ziv.py | 9 ++++++--- compression/lempel_ziv_decompress.py | 3 ++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/compression/lempel_ziv.py b/compression/lempel_ziv.py index 34fd0b2cd672..3ac8573c43d8 100644 --- a/compression/lempel_ziv.py +++ b/compression/lempel_ziv.py @@ -73,7 +73,8 @@ def compress_data(data_bits: str) -> str: def add_file_length(source_path: str, compressed: str) -> str: """ - Adds given file's length in front (using Elias gamma coding) of the compressed string + Adds given file's length in front (using Elias gamma coding) of the compressed + string """ file_length = os.path.getsize(source_path) file_length_binary = bin(file_length)[2:] @@ -84,7 +85,8 @@ def add_file_length(source_path: str, compressed: str) -> str: def write_file_binary(file_path: str, to_write: str) -> None: """ - Writes given to_write string (should only consist of 0's and 1's) as bytes in the file + Writes given to_write string (should only consist of 0's and 1's) as bytes in the + file """ byte_length = 8 try: @@ -110,7 +112,8 @@ def write_file_binary(file_path: str, to_write: str) -> None: def compress(source_path, destination_path: str) -> None: """ - Reads source file, compresses it and writes the compressed result in destination file + Reads source file, compresses it and writes the compressed result in destination + file """ data_bits = read_file_binary(source_path) compressed = compress_data(data_bits) diff --git a/compression/lempel_ziv_decompress.py b/compression/lempel_ziv_decompress.py index 4ba66c80556c..05c26740bf62 100644 --- a/compression/lempel_ziv_decompress.py +++ b/compression/lempel_ziv_decompress.py @@ -56,7 +56,8 @@ def decompress_data(data_bits: str) -> str: def write_file_binary(file_path: str, to_write: str) -> None: """ - Writes given to_write string (should only consist of 0's and 1's) as bytes in the file + Writes given to_write string (should only consist of 0's and 1's) as bytes in the + file """ byte_length = 8 try: