# examples/auto_comments.py

# Usage: python auto_comments.py --path 'path of file or folder'
# Purpose: use QWen-7B-Chat to automatically generate comments for the provided code files. (See auto_comments.md for details.)


import argparse
import os
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig

MaxLine = 50 # maximum number of code lines handled in a single pass
SplitKey = ["\ndef "] # custom markers at which the code text is split into blocks
CodeFileType = ["py"] # so far comment generation has only been tested on Python files

def parse_args():
    """Parse the command-line options of the script.

    Returns:
        The ``argparse.Namespace`` with two attributes: ``path`` (string,
        defaults to ``'Qwen-7B/eval/evaluate_ceval.py'``) and ``regenerate``
        (boolean flag, ``False`` unless ``--regenerate`` is given).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('--path', type=str, default='Qwen-7B/eval/evaluate_ceval.py')
    # If comments were already generated they are not regenerated by default.
    cli.add_argument('--regenerate', action='store_true', default=False)
    return cli.parse_args()

class QWenChat():
    """Small wrapper that loads Qwen-7B-Chat once and answers single queries."""

    def __init__(self):
        """Load the tokenizer, the model (in eval mode) and its generation config."""
        checkpoint = "Qwen/Qwen-7B-Chat"
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)

        # The model is loaded in "auto" mode, which selects the precision
        # based on the device.  Alternatives to the call below:
        #   bf16:     add bf16=True
        #   fp16:     add fp16=True
        #   CPU only: use device_map="cpu"
        loaded = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto", trust_remote_code=True)
        self.model = loaded.eval()

        # Specify hyperparameters for generation
        self.model.generation_config = GenerationConfig.from_pretrained(checkpoint, trust_remote_code=True)
        self.history = None

    def chat(self, query, system = ""):
        """Send ``query`` to the model and return its response.

        History is not used by default: every call passes ``history=None``.
        The history returned by the model is stored in ``self.history`` but
        is not fed back; to use it, pass ``history=self.history`` instead.
        ``system`` is accepted but not used.
        """
        reply, self.history = self.model.chat(self.tokenizer, query, history=None)
        return reply
# Generate comments
def gen_code_comments(context, model = None, **kwargs):
    """Ask the chat model to annotate a piece of code.

    The code is followed by a fixed instruction written in Chinese.  In
    English it reads: generate detailed Chinese comments for the code above,
    using proper syntax; a uniform function-purpose comment is required at
    the start of every function; apart from comments keep the original code
    unchanged; return nothing except comments and code and do not generate
    extra code.

    Args:
        context: the code text to annotate.
        model: object providing ``chat(text)``; the default ``None`` cannot
            be used and fails on the ``chat`` lookup.
        **kwargs: accepted but ignored.

    Returns:
        Whatever ``model.chat`` returns for ``context`` plus the instruction.
    """
    instruction = "\n&#20026;&#20197;&#19978;&#20195;&#30721;&#29983;&#25104;&#32454;&#33268;&#30340;&#20013;&#25991;&#27880;&#37322;&#65292;&#27880;&#24847;&#20351;&#29992;&#21512;&#36866;&#30340;&#35821;&#27861;&#12290;&#35201;&#27714;&#24517;&#39035;&#22312;&#27599;&#20010;&#20989;&#25968;&#24320;&#22836;&#29983;&#25104;&#19968;&#27573;&#32479;&#19968;&#30340;&#20989;&#25968;&#21151;&#33021;&#27880;&#37322;&#12290;\n&#38500;&#20102;&#27880;&#37322;&#65292;&#35831;&#20445;&#35777;&#21407;&#22987;&#20195;&#30721;&#20869;&#23481;&#19981;&#21464;&#12290;&#19981;&#35201;&#36820;&#22238;&#38500;&#20102;&#27880;&#37322;&#21644;&#20195;&#30721;&#20197;&#22806;&#30340;&#20854;&#20313;&#20449;&#24687;&#65292;&#19981;&#35201;&#29983;&#25104;&#39069;&#22806;&#20195;&#30721;&#12290;\n"
    return model.chat(context + instruction)

def read_file(path):
    """Return the whole content of a UTF-8 text file as one string.

    Args:
        path: path of the file to read.

    Returns:
        The decoded text; an empty file yields ``""``.
    """
    # The context manager closes the handle even if decoding fails.
    with open(path, "r", encoding='utf-8') as f:
        return f.read()

def write_file(path, context):
    """Write ``context`` to ``path`` as UTF-8 text, replacing any existing file.

    Args:
        path: destination file path.
        context: the text to store.
    """
    # Encode explicitly as UTF-8: the files written here are read back with
    # encoding='utf-8', and the platform default encoding may differ or be
    # unable to represent non-ASCII comments.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(context)

# If the code file is too long, it can simply be split by the maximum line count
def split_context_by_maxline(text, max_line=None):
    """Split ``text`` into consecutive chunks of at most ``max_line`` lines.

    Lines are delimited by ``"\\n"``.  Joining the returned chunks with
    ``"\\n"`` gives back ``text``.  A text that fits into one chunk
    (including the empty string) is returned as a single-element list.

    Args:
        text: the text to split.
        max_line: maximum number of lines per chunk; when ``None`` the
            module-level ``MaxLine`` is used.

    Returns:
        List of chunk strings, in order.

    Raises:
        ValueError: if the effective ``max_line`` is smaller than 1.
    """
    if max_line is None:
        max_line = MaxLine
    if max_line < 1:
        raise ValueError("max_line must be a positive integer")

    lines = text.split("\n")
    # Stepping from 0 always yields at least one chunk, so short inputs need
    # no special case.
    return [
        "\n".join(lines[start:start + max_line])
        for start in range(0, len(lines), max_line)
    ]

# If the code file is too long, it can simply be split at function definitions
def split_context_by_splitkey(text):
    """Cut ``text`` at every occurrence of the first marker in ``SplitKey``.

    The part before the first marker comes first and is returned as is;
    every following piece gets the marker put back at its start, so
    concatenating the pieces reproduces ``text``.

    Returns:
        List of text pieces, in order.
    """
    marker = SplitKey[0]
    head, *tail = text.split(marker)
    pieces = [head]
    for body in tail:
        pieces.append(marker + body)
    return pieces

# Merge the original code with the generated comments; the goal is to make sure the original code is not modified. Various other strategies could be used for this part.
def merge_code_and_comments(original_file, comments_path):
    """Combine the original code with the comments found in the generated file.

    The text at ``comments_path`` is used only as a source of comments; code
    lines written to the result are taken from ``original_file``, the aim
    being that the original code is not modified.  For every original code
    line the generated lines are searched forward for one that contains it.
    On a match:

    * comment lines, triple-quoted lines and blank lines directly above the
      matched generated line are copied in front of the original line;
    * if the matched line has a ``#`` and the original line has none, the
      text after the last ``#`` is appended as an inline comment.

    Original lines without a match are copied unchanged, original comment
    lines are always copied unchanged, and whitespace-only original lines are
    skipped.  The merged text overwrites ``comments_path``.

    Args:
        original_file: path of the original source file (UTF-8).
        comments_path: path of the generated, commented file (UTF-8); it is
            replaced by the merged result.
    """
    # Read both inputs up front and close them right away.
    with open(original_file, "r", encoding='utf-8') as ori_f:
        ori_lines = ori_f.readlines()
    with open(comments_path, "r", encoding='utf-8') as com_f:
        com_lines = com_f.readlines()

    res = []
    len_com_lines = len(com_lines)
    p = 0  # between lines: index right after the last matched generated line
    j = 0  # forward search cursor into com_lines
    for line in ori_lines:
        if line.isspace():
            continue
        if line.lstrip().startswith('#'):
            res.append(line)
            continue

        # Strip only the line terminator: a final line without a trailing
        # newline must keep its last character.
        code = line.rstrip("\n")
        while j < len_com_lines and code not in com_lines[j]:
            j += 1

        if j < len_com_lines:
            # Walk upwards from the matched line and collect what sits above it.
            p = j - 1
            up_comments = []
            triple_dot_flag = 0  # 1 while inside a multi-line triple-quoted block
            while p >= 0:
                # Stop when reaching the line that was emitted last.
                if res and res[-1] and com_lines[p] == res[-1]:
                    break
                stripped = com_lines[p].strip()
                # Triple-quoted text opened and closed on the same line.
                if len(stripped) > 3 and (
                    (stripped.startswith('"""') and stripped.endswith('"""'))
                    or (stripped.startswith("'''") and stripped.endswith("'''"))
                ):
                    up_comments.append(com_lines[p])
                    p -= 1
                    continue
                # A line that opens or closes a triple-quoted block toggles the flag.
                if stripped.startswith(('"""', "'''")) or stripped.endswith(('"""', "'''")):
                    triple_dot_flag = (triple_dot_flag + 1) % 2
                    up_comments.append(com_lines[p])
                    p -= 1
                    continue
                if triple_dot_flag:
                    up_comments.append(com_lines[p])
                    p -= 1
                    continue
                # Keep blank lines and '#' comments, except comments carrying
                # the literal marker text checked below.
                if stripped == "" or (stripped.startswith('#') and "&#30465;&#30053;&#37096;&#20998;&#20869;&#23481;" not in com_lines[p]):
                    up_comments.append(com_lines[p])
                else:
                    break
                p -= 1
            if up_comments:
                # Collected bottom-up; restore the top-down order.
                res.extend(reversed(up_comments))
            if "#" in com_lines[j] and "#" not in line:
                in_line_comments = "  #" + com_lines[j].split("#")[-1]
                merged = code + in_line_comments
                # The matched generated line may lack a newline (last line of
                # the file); do not let the next code line be glued onto it.
                if line.endswith("\n") and not merged.endswith("\n"):
                    merged += "\n"
                res.append(merged)
            else:
                res.append(line)
            p = j+1
        else:
            # No generated line contains this code line: keep it as is and
            # restart the search right after the last match.
            res.append(line)
            j = p

    write_file(comments_path, "".join(res))

# Process a single file
def deal_one_file(model, path, args):
    """Generate the commented copy of one source file.

    The result is written next to the source as ``<name>_comments<ext>``.
    When that file already exists and ``args.regenerate`` is false, nothing
    is done apart from printing a "use cache" notice.

    A file of at most ``MaxLine`` lines is sent to the model in one request.
    A longer file is cut at the first marker in ``SplitKey`` when the text
    contains it, otherwise into chunks of ``MaxLine`` lines; each piece is
    annotated separately and the answers are joined with newlines.  Finally
    the generated text is merged with the original code.

    Args:
        model: chat model handed to ``gen_code_comments``.
        path: path of the source file.
        args: parsed options; only ``args.regenerate`` is read.
    """
    # os.path handles the platform separator, keeps the directory of
    # root-level files and splits off only the last extension, so a name
    # like "a.b.py" becomes "a.b_comments.py".
    folder, fname = os.path.split(path)
    stem, ext = os.path.splitext(fname)
    comments_path = os.path.join(folder, stem + "_comments" + ext)

    if (not args.regenerate) and os.path.exists(comments_path):
        print("use cache: ", comments_path)
        return

    context = read_file(path)
    # "<=": a file with exactly MaxLine lines still fits into one request.
    if len(context.split("\n")) <= MaxLine:
        res = gen_code_comments(context, model = model)
    else:
        if SplitKey[0] in context:
            context_list = split_context_by_splitkey(context)
        else:
            context_list = split_context_by_maxline(context)
        res = "\n".join([gen_code_comments(context_block, model = model) for context_block in context_list])

    write_file(comments_path, res)
    merge_code_and_comments(path, comments_path)

# Process a folder
def deal_folder(model, path, args):
    """Recursively generate comments for every supported file below ``path``.

    A file is processed when its extension is listed in ``CodeFileType`` and
    its file name does not contain ``"_comments"`` (the suffix used for
    generated files).  Sub-folders are visited recursively.

    Args:
        model: chat model handed on to ``deal_one_file``.
        path: folder to walk.
        args: parsed options, handed on to ``deal_one_file``.
    """
    for fl in os.listdir(path):
        now_path = os.path.join(path, fl)
        if os.path.isfile(now_path):
            # Both filters look at the file name only, so the names of the
            # enclosing folders cannot cause files to be skipped.
            extension = os.path.splitext(fl)[1][1:]
            if (extension in CodeFileType) and ("_comments" not in fl):
                deal_one_file(model, now_path, args)
        elif os.path.isdir(now_path):
            deal_folder(model, now_path, args)
        else:
            # Entry that is neither a regular file nor a folder.
            print("Please specify a correct path!")

def transfer(args):
    """Run comment generation for the file or folder given in ``args.path``.

    A single file is processed only when its extension is listed in
    ``CodeFileType`` and its file name does not contain ``"_comments"``;
    other files are ignored silently.  A folder is walked recursively.  For
    a path that is neither, a message is printed.

    The model is created only once the path is known to need it, so an
    invalid path is reported without loading the model.

    Args:
        args: parsed options; ``args.path`` is read here and the whole object
            is handed on to the per-file / per-folder functions.
    """
    path = args.path
    if os.path.isfile(path):
        # Filter on the file name only, not on the enclosing folders.
        fname = os.path.basename(path)
        extension = os.path.splitext(fname)[1][1:]
        if (extension in CodeFileType) and ("_comments" not in fname):
            deal_one_file(QWenChat(), path, args)
    elif os.path.isdir(path):
        deal_folder(QWenChat(), path, args)
    else:
        print("Please specify a correct path!")

if __name__ == '__main__':
    # Script entry point: parse the command-line options, print them, then
    # run comment generation for the requested path.
    args = parse_args()
    print(args)
    transfer(args)