StochasticGPT/paper_class.py at master · shiyu-coder/StochasticGPT · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import copy
import os
import re

import json
import traceback

import joblib

from llm_api import GPT, parse_json, llm_request
from langchain.document_loaders.text import TextLoader
from langchain.text_splitter import LatexTextSplitter
from util import multiprocess, get_cpu_count

root_cache_path = "../StochasticGPT_data/cache"


def load_from_cache(file_name):
    """Load a cached ``Paper`` from the cache directory, if one exists.

    The cache entry is looked up at ``<root_cache_path>/<file_name>.pkl``.

    Args:
        file_name: Base name of the cache entry (without the ``.pkl`` suffix).

    Returns:
        A ``Paper`` populated through ``Paper.load_cache`` with ``file_name``
        set on it, or ``None`` when no such cache file exists.
    """
    cache_file = f"{root_cache_path}/{file_name}.pkl"

    # Test the exact path instead of listing the directory: isfile() simply
    # returns False when the cache directory has not been created yet, whereas
    # os.listdir() would raise FileNotFoundError in that situation.
    if not os.path.isfile(cache_file):
        return None

    # NOTE: joblib.load unpickles the file, which can execute arbitrary code.
    # Only cache files written by this project should live in the cache dir.
    ls_cache = joblib.load(cache_file)

    paper = Paper()
    paper.load_cache(ls_cache)
    paper.file_name = file_name
    return paper


def save_cache(paper):
    """Persist the cacheable state of ``paper`` to the cache directory.

    The payload returned by ``paper.get_cache()`` is written to
    ``<root_cache_path>/<paper.file_name>.pkl``.

    Args:
        paper: Object exposing ``get_cache()`` and a ``file_name`` attribute.
    """
    # joblib.dump does not create missing parent directories, so make sure the
    # cache directory exists before the first write.
    os.makedirs(root_cache_path, exist_ok=True)

    ls_cache = paper.get_cache()
    joblib.dump(ls_cache, f"{root_cache_path}/{paper.file_name}.pkl")


def replace_section_content(latex_content, section_title, new_content):
    """Swap the body of one LaTeX section or subsection for new text.

    The first ``\\section{<title>}`` or ``\\subsection{<title>}`` heading whose
    title equals ``section_title`` is located. Everything between that heading
    and the next ``\\section`` / ``\\subsection`` command (or the end of the
    document) is replaced by ``new_content`` surrounded by newlines. The
    heading itself is kept.

    Args:
        latex_content: Full LaTeX source to search.
        section_title: Literal heading title; regex metacharacters in it are
            escaped before matching.
        new_content: Replacement body text.

    Returns:
        The updated LaTeX source, or ``latex_content`` unchanged when no
        matching heading exists (a message is printed in that case).
    """
    # Escape the title so characters such as "+", "(" or "$" match literally.
    title_pattern = re.escape(section_title)

    # Group 1 captures the heading so it can be re-emitted verbatim.
    heading_pattern = (
        r"(\\section\{" + title_pattern + r"\}|\\subsection\{" + title_pattern + r"\})"
    )
    # Lazy body, terminated (without being consumed) by the next heading or by
    # the end of the input.
    body_pattern = ".*?"
    stop_pattern = r"(?=\\section|\\subsection|$)"

    found = re.search(
        heading_pattern + body_pattern + stop_pattern,
        latex_content,
        re.DOTALL,
    )

    if found is None:
        # No matching section or subsection: report it and return the input.
        print("Section or Subsection not found.")
        return latex_content

    before = latex_content[:found.start()]
    after = latex_content[found.end():]
    return "".join([before, found.group(1), "\n", new_content, "\n", after])


def get_polishing_paper(paper_content, dt_polishing_result):
    """Build a copy of the paper with every polished section substituted in.

    Args:
        paper_content: Original LaTeX source of the paper.
        dt_polishing_result: Mapping of section title to its polished text.
            Entries whose text is empty are skipped.

    Returns:
        The paper source with each non-empty polished section applied through
        ``replace_section_content``, in the mapping's iteration order.
    """
    updated_paper = copy.deepcopy(paper_content)
    for section_label, polished_text in dt_polishing_result.items():
        if len(polished_text) == 0:
            # Nothing was produced for this section; keep the original text.
            continue
        updated_paper = replace_section_content(updated_paper, section_label, polished_text)
    return updated_paper


class Paper:
    """State holder for one paper and the results derived from it.

    Every attribute starts as ``None``. All attributes except ``file_name``
    make up the cache payload exchanged by ``get_cache`` and ``load_cache``;
    both methods use the same fixed nine-element order.
    """

    def __init__(self):
        # Not part of the cache payload; callers assign it separately.
        self.file_name = None
        # Cached fields, declared in the same order as the cache payload.
        self.title = self.paper_content = None
        self.overall_structure = self.dt_section_structure = self.dt_section_content = None
        self.dt_analysis_result = self.paper_score = self.dt_score = None
        self.dt_polishing_result = None

    def initial_polishing_result(self):
        """Reset ``dt_polishing_result`` to an empty string per section label.

        The labels are the keys of ``dt_section_content``.
        """
        blank_results = {}
        self.dt_polishing_result = blank_results
        blank_results.update(
            (section_label, "") for section_label in self.dt_section_content.keys()
        )

    def get_cache(self):
        """Return the cacheable attributes as a list, in ``load_cache`` order."""
        payload = [
            self.title,
            self.paper_content,
            self.overall_structure,
            self.dt_section_structure,
            self.dt_section_content,
            self.dt_analysis_result,
            self.paper_score,
            self.dt_score,
            self.dt_polishing_result,
        ]
        return payload

    def load_cache(self, ls_cache):
        """Restore the cacheable attributes from a ``get_cache`` payload.

        ``ls_cache`` must contain exactly nine items; any other length makes
        the unpacking raise ``ValueError`` before any attribute is assigned.
        """
        (
            self.title,
            self.paper_content,
            self.overall_structure,
            self.dt_section_structure,
            self.dt_section_content,
            self.dt_analysis_result,
            self.paper_score,
            self.dt_score,
            self.dt_polishing_result,
        ) = ls_cache


if __name__ == '__main__':
    # Manual check: load one experiment result and print what was restored.
    file_name = 'DCU-AQ.tex_lan'

    paper = Paper()
    # Alternative source (the regular cache directory), kept for reference:
    # paper.load_cache(joblib.load(f"{root_cache_path}/{file_name}"))
    paper.load_cache(joblib.load(f"exp_result/our_method/{file_name}.pkl"))
    paper.file_name = file_name

    print(paper)
    print(paper.dt_polishing_result)

    # One-off migration kept for reference: it appended an empty polishing
    # result to a cache payload and wrote the extended payload back.
    # title = "Managing Large Dataset Gaps in Urban Air Quality Prediction: DCU-Insight-AQ at MediaEval 2022"
    # ls_cache = joblib.load(f"{root_cache_path}/{file_name}")
    # dt_polishing_result = {}
    # for section_label in ls_cache[4].keys():
    #     dt_polishing_result[section_label] = ""
    # joblib.dump([*ls_cache, dt_polishing_result], f"{root_cache_path}/{file_name}")