quyuan há 1 ano atrás
pai
commit
80e7a50e17
46 ficheiros alterados com 11 adições e 985 exclusões
  1. 1 2
      .github/workflows/benchmark.yml
  2. 0 73
      tests/benchmark/benchmark.py
  3. 0 114
      tests/benchmark/calculate_score.py
  4. 0 45
      tests/benchmark/datasets/annotations/cleaned/cleaned_academic_literature_0b2c9c91f5232541a7ace8984df306b2.md
  5. 0 47
      tests/benchmark/datasets/annotations/cleaned/cleaned_academic_literature_f7904bc37cc2e25c1e3e412978854b10.md
  6. 0 221
      tests/benchmark/datasets/annotations/cleaned/cleaned_academic_literature_fbdb99151e811688574c0c4c67341074.md
  7. 0 28
      tests/benchmark/datasets/annotations/cleaned/cleaned_ordinary_textbook_1d9a847603a5e37e379738316820850d.md
  8. 0 246
      tests/benchmark/datasets/annotations/cleaned/cleaned_research_report_1f978cd81fb7260c8f7644039ec2c054.md
  9. BIN
      tests/benchmark/datasets/pdf/academic_literature_0b2c9c91f5232541a7ace8984df306b2.pdf
  10. BIN
      tests/benchmark/datasets/pdf/academic_literature_f7904bc37cc2e25c1e3e412978854b10.pdf
  11. BIN
      tests/benchmark/datasets/pdf/academic_literature_fbdb99151e811688574c0c4c67341074.pdf
  12. BIN
      tests/benchmark/datasets/pdf/ordinary_textbook_1d9a847603a5e37e379738316820850d.pdf
  13. BIN
      tests/benchmark/datasets/pdf/research_report_1f978cd81fb7260c8f7644039ec2c054.pdf
  14. 0 4
      tests/benchmark/env.sh
  15. 0 9
      tests/benchmark/magic-pdf.json
  16. 0 131
      tests/benchmark/pre_clean.py
  17. 0 1
      tests/benchmark/result.json
  18. 0 48
      tests/benchmark/scoring.py
  19. 1 1
      tests/test_cli/conf/conf.py
  20. 0 0
      tests/test_cli/pdf_dev/14a75ee1-b88a-4fe7-bb10-62cbfabbfdec.html.json
  21. BIN
      tests/test_cli/pdf_dev/14a75ee1-b88a-4fe7-bb10-62cbfabbfdec.html.pdf
  22. 0 0
      tests/test_cli/pdf_dev/2365839d-4116-45de-b2f0-3a740e1d6c20.html.json
  23. BIN
      tests/test_cli/pdf_dev/2365839d-4116-45de-b2f0-3a740e1d6c20.html.pdf
  24. 0 0
      tests/test_cli/pdf_dev/24cb61a0-cace-460a-a42b-495a86caf88f.html.json
  25. BIN
      tests/test_cli/pdf_dev/24cb61a0-cace-460a-a42b-495a86caf88f.html.pdf
  26. 0 0
      tests/test_cli/pdf_dev/300970fd-b34a-4656-a334-23059595b360.html.json
  27. BIN
      tests/test_cli/pdf_dev/300970fd-b34a-4656-a334-23059595b360.html.pdf
  28. 0 0
      tests/test_cli/pdf_dev/40c595b5-3b62-4021-b8dd-5e445d223c47.html.json
  29. BIN
      tests/test_cli/pdf_dev/40c595b5-3b62-4021-b8dd-5e445d223c47.html.pdf
  30. 0 0
      tests/test_cli/pdf_dev/416b8524-9a6f-4b49-b7d4-56ce5c825699.html.json
  31. BIN
      tests/test_cli/pdf_dev/416b8524-9a6f-4b49-b7d4-56ce5c825699.html.pdf
  32. 0 0
      tests/test_cli/pdf_dev/658cbc48-9edd-4537-8b02-261c052a2845.html.json
  33. BIN
      tests/test_cli/pdf_dev/658cbc48-9edd-4537-8b02-261c052a2845.html.pdf
  34. 0 0
      tests/test_cli/pdf_dev/789b3b75-b5ad-49c2-8ba1-e8719f7a1d42.html.json
  35. BIN
      tests/test_cli/pdf_dev/789b3b75-b5ad-49c2-8ba1-e8719f7a1d42.html.pdf
  36. 0 0
      tests/test_cli/pdf_dev/9eb3c6a7-1564-4a10-8cfb-56c628e46208.html.json
  37. BIN
      tests/test_cli/pdf_dev/9eb3c6a7-1564-4a10-8cfb-56c628e46208.html.pdf
  38. 0 0
      tests/test_cli/pdf_dev/b80cbc13-6655-42a8-a3a1-fe2db6eff883.html.json
  39. BIN
      tests/test_cli/pdf_dev/b80cbc13-6655-42a8-a3a1-fe2db6eff883.html.pdf
  40. 0 0
      tests/test_cli/pdf_dev/bb72581d-bcbd-419c-ba55-a26af7c7f00d.html.json
  41. BIN
      tests/test_cli/pdf_dev/bb72581d-bcbd-419c-ba55-a26af7c7f00d.html.pdf
  42. 0 0
      tests/test_cli/pdf_dev/ef36fc6f-d521-49b6-9846-85e565404632.html.json
  43. BIN
      tests/test_cli/pdf_dev/ef36fc6f-d521-49b6-9846-85e565404632.html.pdf
  44. 0 0
      tests/test_cli/pdf_dev/p3_图文混排84.json
  45. BIN
      tests/test_cli/pdf_dev/p3_图文混排84.pdf
  46. 9 15
      tests/test_cli/test_cli.py

+ 1 - 2
.github/workflows/benchmark.yml

@@ -35,6 +35,5 @@ jobs:
     - name: get-benchmark-result
       run: |
         echo "start test"
-        cd $GITHUB_WORKSPACE/tests/benchmark/ 
-        tree
+        cd $GITHUB_WORKSPACE &&  pytest -s -v tests/test_cli/test_ben.py
   

+ 0 - 73
tests/benchmark/benchmark.py

@@ -1,73 +0,0 @@
-"""
-bench
-"""
-import os
-import shutil
-import json
-import calculate_score
-code_path = os.environ.get('GITHUB_WORKSPACE')
-#评测集存放路径
-pdf_dev_path = "datasets/"
-#magicpdf跑测结果
-pdf_res_path = "/tmp/magic-pdf"
-
-def test_cli():
-    """
-    test pdf-command cli
-    """
-    rm_cmd = f"rm -rf {pdf_res_path}"
-    os.system(rm_cmd)
-    os.makedirs(pdf_res_path)
-    cmd = f'magic-pdf pdf-command --pdf {os.path.join(pdf_dev_path, "mineru")}'
-    os.system(cmd)
-    for root, dirs, files in os.walk(pdf_res_path):
-         for magic_file in files:
-            target_dir = os.path.join(pdf_dev_path, "mineru")
-            if magic_file.endswith(".md"):
-                source_file = os.path.join(root, magic_file)
-                target_file = os.path.join(pdf_dev_path, "mineru", magic_file)
-                if not os.path.exists(target_dir):
-                    os.makedirs(target_dir) 
-                shutil.copy(source_file, target_file)
-
-def get_score():
-    """
-    get score
-    """
-    data_path = os.path.join(pdf_dev_path, "ci")
-    score = calculate_score.Scoring(os.path.join(data_path, "result.json"))
-    score.calculate_similarity_total("mineru", data_path)
-    res = score.summary_scores()
-    return res
-
-
-def ci_ben():
-    """
-    ci benchmark
-    """
-    try:
-        fr = open(os.path.join(pdf_dev_path, "result.json"), "r", encoding="utf-8")
-        lines = fr.readlines()
-        last_line = lines[-1].strip()
-        last_score = json.loads(last_line)
-        print ("last_score:", last_score)
-        last_simscore = last_score["average_sim_score"]
-        last_editdistance = last_score["average_edit_distance"]
-        last_bleu = last_score["average_bleu_score"]
-    except IOError:
-        print ("result.json not exist")
-    test_cli()
-    os.system(f"python pre_clean.py --tool_name mineru --download_dir {pdf_dev_path}")
-    now_score = get_score()
-    print ("now_score:", now_score)
-    now_simscore = now_score["average_sim_score"]
-    now_editdistance = now_score["average_edit_distance"]
-    now_bleu = now_score["average_bleu_score"]
-    assert last_simscore <= now_simscore
-    assert last_editdistance <= now_editdistance
-    assert last_bleu <= now_bleu
-
-
-if __name__ == "__main__":
-    os.system("sh env.sh")
-    ci_ben()

+ 0 - 114
tests/benchmark/calculate_score.py

@@ -1,114 +0,0 @@
-"""
-calculate_score
-"""
-import os
-import re
-import json
-import scoring
-from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
-from nltk.tokenize import word_tokenize
-from Levenshtein import distance
-
-class Scoring:
-    """
-    calculate_score 
-    """
-    def __init__(self, result_path):
-        """
-        init
-        """
-        self.edit_distances = []
-        self.bleu_scores = []
-        self.sim_scores = []
-        self.filenames = []
-        self.score_dict = {}
-        self.anntion_cnt = 0
-        self.fw = open(result_path, "w+", encoding='utf-8')
-
-    def simple_bleu_score(self, candidate, reference):
-        """
-        get bleu score
-        """
-        candidate_tokens = word_tokenize(candidate)
-        reference_tokens = word_tokenize(reference)
-        return sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=SmoothingFunction().method1)
-
-
-    def preprocess_string(self, s):
-        """
-        preprocess_string
-        """
-        sub_enter = re.sub(r'\n+', '\n', s)
-        return re.sub(r'  ', ' ', sub_enter)
-    
-    def calculate_similarity(self, annotion, actual, tool_type):
-        """
-        calculate_similarity
-        """
-        class_dict = {}
-        edit_distances = []
-        bleu_scores = []
-        sim_scores = list()
-        total_file = 0
-        for filename in os.listdir(annotion):
-            if filename.endswith('.md') and not filename.startswith('.'):
-                total_file = total_file + 1
-                with open(os.path.join(annotion, filename), 'r', encoding='utf-8') as file_a:
-                    content_a = file_a.read()
-                self.anntion_cnt = self.anntion_cnt + 1
-                filepath_b = os.path.join(actual, filename)
-                if os.path.exists(filepath_b):
-                    with open(filepath_b, 'r', encoding='utf-8') as file_b:
-                        content_b = file_b.read()
-                        self.filenames.append(filename)
-                        edit_dist = distance(self.preprocess_string(content_b),self.preprocess_string(content_a)) / max(len(content_a), len(content_b))
-                        self.edit_distances.append(edit_dist)
-                        edit_distances.append(edit_dist)
-                        bleu_score = self.simple_bleu_score(content_b, content_a)
-                        bleu_scores.append(bleu_score)
-                        self.bleu_scores.append(bleu_score)
-                        score = scoring.score_text(content_b, content_a)
-                        sim_scores.append(score)
-                        self.sim_scores.append(score)
-                        class_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score}
-                        self.score_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score}
-                else:  
-                    print(f"File {filepath_b} not found in actual directory.")
-        class_average_edit_distance = sum(edit_distances) / len(edit_distances) if edit_distances else 0
-        class_average_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
-        class_average_sim_score = sum(sim_scores) / len(sim_scores) if sim_scores else 0
-        self.fw.write(json.dumps(class_dict, ensure_ascii=False) + "\n")
-        ratio = len(class_dict)/total_file
-        self.fw.write(f"{tool_type} extract ratio:  {ratio}" + "\n")
-        self.fw.write(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}" + "\n")
-        self.fw.write(f"{tool_type} Average BLEU Score: {class_average_bleu_score}" + "\n")
-        self.fw.write(f"{tool_type} Average Sim Score: {class_average_sim_score}" + "\n")
-        print (f"{tool_type} extract ratio: {ratio}")
-        print (f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}")
-        print (f"{tool_type} Average BLEU Score: {class_average_bleu_score}")
-        print (f"{tool_type} Average Sim Score: {class_average_sim_score}")
-        return self.score_dict
-    
-    def summary_scores(self):
-        """
-        calculate the average of edit distance, bleu score and sim score
-        """
-        over_all_dict = dict()
-        average_edit_distance = sum(self.edit_distances) / len(self.edit_distances) if self.edit_distances else 0  
-        average_bleu_score = sum(self.bleu_scores) / len(self.bleu_scores) if self.bleu_scores else 0  
-        average_sim_score = sum(self.sim_scores) / len(self.sim_scores) if self.sim_scores else 0
-        over_all_dict["average_edit_distance"] = average_edit_distance
-        over_all_dict["average_bleu_score"] = average_bleu_score
-        over_all_dict["average_sim_score"] = average_sim_score
-        self.fw.write(json.dumps(over_all_dict, ensure_ascii=False) + "\n")
-        return over_all_dict
-
-    def calculate_similarity_total(self, tool_type, download_dir):
-        """
-        calculate the average of edit distance, bleu score and sim score
-        """
-        annotion = os.path.join(download_dir, "annotations", "cleaned")
-        actual = os.path.join(download_dir, tool_type, "cleaned")
-        score = self.calculate_similarity(annotion, actual, tool_type)
-        return score
-

Diff do ficheiro suprimida por ser muito extensa
+ 0 - 45
tests/benchmark/datasets/annotations/cleaned/cleaned_academic_literature_0b2c9c91f5232541a7ace8984df306b2.md


Diff do ficheiro suprimida por ser muito extensa
+ 0 - 47
tests/benchmark/datasets/annotations/cleaned/cleaned_academic_literature_f7904bc37cc2e25c1e3e412978854b10.md


+ 0 - 221
tests/benchmark/datasets/annotations/cleaned/cleaned_academic_literature_fbdb99151e811688574c0c4c67341074.md

@@ -1,221 +0,0 @@
-# Artificial Intelligence for 6G Networks: Technology Advancement and Standardization 
-
-Muhammad K. Shehzad, Luca Rose, M. Majid Butt, István Z. Kovács, Mohamad Assaad, and Mohsen Guizani
-
-Abstract—With the deployment of 5G networks, standards organizations have started working on the design phase for sixth-generation ( $6 \mathrm{G}$ ) networks. $6 \mathrm{G}$ networks will be immensely complex, requiring more deployment time, cost and management efforts. On the other hand, mobile network operators demand these networks to be intelligent, self-organizing, and cost-effective to reduce operating expenses (OPEX). Machine learning (ML), a branch of artificial intelligence (AI), is the answer to many of these challenges providing pragmatic solutions, which can entirely change the future of wireless network technologies. By using some case study examples, we briefly examine the most compelling problems, particularly at the physical (PHY) and link layers in cellular networks where ML can bring significant gains. We also review standardization activities in relation to the use of ML in wireless networks and future timeline on readiness of standardization bodies to adapt to these changes. Finally, we highlight major issues in ML use in the wireless technology, and provide potential directions to mitigate some of them in $6 \mathrm{G}$ wireless networks.
-
-Index Terms-AI, ML, Wireless networks, 3GPP, 6G.
-
-## I. INTRODUCTION
-
-Unprecedented growth in the global cellular traffic (as shown in Fig. 1) and immense data rate demands have become a challenge, leading wireless industry to the next-generation, called 6G. 6G-era will bring digital, physical and biological worlds together with the goal to improve human experience and well-being. $6 \mathrm{G}$ will be operating in TeraHertz $(\mathrm{THz})$ frequencies $(0.1-10 \mathrm{THz})$, hence beneficial for multiple use cases in industrial applications, providing immense data rates $(\approx 1 \mathrm{~Tb} / \mathrm{s})$, accelerating internet-of-things, and wider network coverage. AI/ML will pave the way for $\mathrm{THz}$ communications at different layers [2], e.g., supporting channel acquisition [3] and modulation classification [4] at PHY. Similarly, at the link layer, beamforming design and channel allocation can exploit ML [2]. In $\mathrm{THz}$ systems, a channel can significantly vary at a micrometer scale, resulting in a tremendous increase in channel estimation frequency and corresponding overhead. ML algorithms can counter this issue by using, e.g., improved channel prediction techniques [3], [5].
-
-
-
-Fig. 1. Estimation of global mobile subscriptions in machine-to-machine (M2M) and mobile broadband (MBB) from 2020 to 2030. Source: ITU-R Report M. $2370-0$ [1].
-
-Recently, fast-growing deployment of $5 \mathrm{G}$ has opened up many challenges, including massive complexity in network architecture, low latency, high cost, power consumption, and deployment of hybrid Long-Term Evolution (LTE) new radio $(\mathrm{NR})$, leading to difficulties in network optimization. In such a complex scenario, the network intelligence has become a major focus as it will play a pivotal role in complex problem solving [6], e.g., self-healing, self-optimization, and self-configuration of a network [7].
-
-Future networks will become "cognitive" in a way that many aspects such as spectrum sensing/sharing, slicing, radio resource management (RRM), and mobility management, will be ML-based. Further, it is expected that ML will impact 6G air interface fundamentally and it will be designed to support ML natively [8]. Several recent research attempts, e.g., [9], propose different road maps for 6G, but they do not address standardization timeline and related issues regarding application of ML in 6G. Albeit, to some extent, [10] gives an overview of ML and standardization; nevertheless, ML-related technical challenges and its applications from an industrial and standardization perspective are not addressed.
-
-Reconfigurable intelligent surface (RIS) and non-orthogonal multiple access (NOMA) are two key technologies for 6G [11]. RIS can re-engineer electromagnetic waves, hence beneficial to deliver the information where obstacles block the destination. RIS can be integrated with ML, allowing RIS to acquire envi-ronmental information by configuring various sensors, while ML can learn dynamic parameters intelligently, reducing the computation cost of RIS-based networks. Similarly, NOMA is a promising access technique for $6 \mathrm{G}$. In ML-empowered NOMA-based networks, gNodeBs ( $\mathrm{gNB}$ ) can intelligently define their control policy and improve decision-making ability.
-
-
-
-Fig. 2. An overview of ML paradigms, major tools, and applications in wireless networks.
-
-Today's networks use model-based methods to optimize various network functions providing characteristics of the process involved. However, these models might be too complex to be implemented in a realistic time frame or they include a great level of abstraction to function in a general environment. In contrast, ML-based solutions can adapt to real-time (RT) scenario changes and localized characteristics, learning the specific environment around the transceivers. The contributions of this article are twofold:
-
-- We look at the above-mentioned problems from an industrial perspective and outline the gap between research and practice.
-- We review standardization activities in the context of adopting ML in various aspects of wireless communications, e.g., channel acquisition, positioning. Furthermore, we highlight major issues and possible research directions in relation to the use of ML in wireless networks.
-
-## II. OVERVIEW OF ML TECHNIQUES IN WIRELESS NETWORKS
-
-ML is a process of training machines through data without explicit programming. Broadly speaking, ML consists of three paradigms: unsupervised learning, supervised learning, and reinforcement learning (RL). All these paradigms have a training/exploration phase to optimize a learning algorithm that later can be used in prediction/exploitation phase to infer on unknown inputs. As shown in Fig. 2, we briefly summarize them by providing some use cases in wireless networks.
-
-1) Supervised Learning: Supervised learning exploits a labelled data set to learn a (hidden) function that maps an input to an expected output based on the examples. The standard techniques used to solve supervised learning-based problems are artificial neural networks (ANNs), support vector machines (SVMs), Bayesian networks, recurrent neural networks (RNNs), and convolutional neural networks (CNNs).
-2) Unsupervised Learning: Unsupervised learning does not learn from labelled data, instead, training is based on an unlabelled data set. K-means and principal component analysis (PCA) are examples of two major tools used for clustering and dimensionality reduction, respectively.
-3) Reinforcement Learning: RL is not based on training but rather the agent/decision-maker learns and decides online, maximizing a long-term reward. RL is beneficial in control problems where the agent adapts to changing environmental conditions, e.g., uplink power control.
-
-Motivated by the considerable benefits of ML in various fields, its applications have also been considered in wireless networks almost at all layers of communication. Here, we focus on its impact on radio access networks (RAN), particularly PHY and link layers. Based on ML tools, given in Fig.2, some case studies will be explained later in Section III.
-
-## A. Machine Learning at PHY
-
-At PHY, many optimization problems are non-convex, e.g., sum-rate maximization. ML is a powerful tool to find good solution(s) for such non-convex optimization problems. Based on advanced learning algorithms, 6G networks provide the following major advantages by using ML.
-
-- ML can be effective to deal with network complexity. 6G networks will be more complex due to numerous network topologies, immense growth in the cellular users, staggering data rate demands, complex air interface, vast network coordination methods, etc. Forecasting considerable complexity of $6 \mathrm{G}$ networks, the derivation of optimum performance solutions is nearly infeasible without ML.
-- ML can play a vital role to deal with model deficit problems. Current cellular networks are amenable for mathematical derivation, for instance, information theory gives closed-form expressions for various problems such as Shannon theorem. However, the inherent complexity of $6 \mathrm{G}$ networks hinders the possibility of exploiting closed-form analytical expression(s), which can be due, for instance, to non-linearities either in the channel or network devices. ML offers an efficient way to deal with non-linearities, providing feasible solution(s) in a tractable manner.
-- ML can cope with algorithm deficit problems. In current cellular networks, many optimal algorithms, although well-characterized, are impractical to be implemented. Considering the example of multiple-input multipleoutput (MIMO) systems where optimal solutions are known (e.g., dirty paper coding), they are overlooked in favour of linear solutions, e.g., linear minimum meansquared error. It is envisaged that ML can pave the way to implement more efficient yet practical solutions.
-
-ML has been used to study various PHY issues, and without being exhaustive, some of the recent areas include:
-
-- CNNs are used for modulation classification in [4].
-- An RNN-based wireless channel predictor [5] is used in [3], explained in Section III-C to deal with inaccurate channel state information (CSI).
-
-## III. Wireless Networks: Case Studies
-
-In this section, we present three use cases to demonstrate the use of ML techniques in industrial wireless networks. ML tools utilized for these use cases are depicted in Fig. 2.
-
-## A. UE Positioning
-
-Highly accurate user equipment (UE) positioning is one of the prime considerations for Third Generation Partnership Project (3GPP) studies beyond Release 15. Various angle and time-of-arrival-based methods are used to determine UE positioning in today's cellular networks. All of these methods require triangulation techniques to resolve UE position and suffer from time synchronization errors.
-
-We studied UE position by using radio frequency (RF) fingerprinting and two ML techniques, namely deep learning and decision tree, for an outdoor scenario [12]. Serving cell Reference Signal Received Power (RSRP) as well as neighbor cell RSRP values were used as features to train a deep neural network (DNN). As shown in Fig. 3, nearly $5 \mathrm{~m}$ accuracy is achieved for DNN when only 4 serving cell RSRP values and corresponding beam IDs are considered as a feature input, while it improves to nearly $1 \mathrm{~m}$ when 2 more RSRP values from the strongest neighboring cells, respective cell and beam IDs are added to the input feature set. The decision tree, a less complex algorithm as compared to DNN, provides about $2 \mathrm{~m}$ accuracy using data from both serving and neighboring cell beams as an input feature. The mean accuracy of nearly $1 \mathrm{~m}$ obtained from DNN is comparable to the accuracy level achieved with traditional methods without requiring triangulation and does not suffer from signal timing synchronization issues.
-
-## B. ML-Assisted Proactive Mobility
-
-For seamless and efficient mobility, a well optimized network should reduce the number of Handover (HO) events while avoiding Handover Failures (HOF) and Radio Link Failures (RLF). An emerging approach is to utilize ML-based algorithms, which enable proactive and UE specific mobility actions in the gNB. A relatively simple approach to this is to design an ML-based estimator of the radio measurements, such as RSRP of serving and neighbor cells, with a certain minimum accuracy and within a certain time horizon. Radio measurements are traditionally performed at the UEs side and reported to the serving $\mathrm{gNB}$ (or gNB-Centralized Unit) according to specific Radio Resource Control (RRC) configurations. For ML-based prediction purposes, time-traces of RSRP, or Reference Signal Received Quality (RSRQ) values need to be collected either in the UE and/or serving the gNB.
-
-
-
-Fig. 3. Comparison of UE position for both DNN and decision tree techniques. The system level parameters for the network includes 8 sites with Inter-site distance $110 \mathrm{~m}$ and carrier frequency $28 \mathrm{GHz}$. For details of the parameters, please refer to [12].
-
-For example, collected time-series of RSRP values are used as input to the ML-based predictor, which provides at the UE, and/or at the serving $\mathrm{gNB}$, a set of sufficiently accurately estimated RSRP values within a given future time horizon. Then, these signal estimations are used for predictive evaluation of possible $\mathrm{HO}$ conditions, thus can trigger proactive measurement reports from the UE and/or proactive $\mathrm{HO}$ actions at the serving $\mathrm{gNB}$. These two steps are repeated with a time periodicity given, e.g., by the sampling rate and time filtering of the input RSRP measurements [13], or alternatively, the steps can also be triggered by the serving $\mathrm{gNB}$ when certain traffic or mobility Quality-of-Service (QoS) conditions are met.
-
-The outlined ML-based mobility algorithm can be implemented in either the UE or gNB or both, depending on the available ML assistance capabilities in each node. Furthermore, the mechanism can be integrated in self-organizing network-based Mobility Robustness Optimization solutions.
-
-## C. CSI Feedback
-
-CSI feedback in the downlink channel is a major challenge in Release 17 and beyond. Currently, CSI precision is affected by compressing the measurements imposed by the standard.
-
-In our study, summarized in Section II-A, we assumed two RNN-based twin channel predictors at the $\mathrm{gNB}$ and UE [3]. The past CSI is utilized for training the RNN at both ends of the communication system. UE's feedback is evaluated with respect to the predicted channel. Fig. 4 depicts the meansquared error (MSE) between the actual channel versus the acquired channel at the $\mathrm{gNB}$ and the precoding gain when different quantization bits are used to feedback the CSI from the UE. The results are compared with and without using ML for the CSI feedback. A clear benefit of using ML can be observed. We believe that ML-based solutions will improve current performance without increasing signaling overhead.
-
-
-
-(a) Trend of MSE.
-
-
-
-(b) Trend of precoding gain.
-
-Fig. 4. Performance of MSE and precoding gain. $2 \times 1$ MIMO configuration is considered, and RNN is composed of 1 hidden layer. For parameters' details, refer to [3].
-
-## IV. Role of ML in Standardization
-
-The potential of ML for $5 \mathrm{G}$ has been widely acknowledged in the literature and applications made it even in the standard at higher levels, e.g., for networking and security [7]. 3GPP has introduced a specification, named network data analytics function (NWDAF), in Release 15 and 16, as part of the $5 \mathrm{G}$ Core $(5 \mathrm{GC})$ architecture [7]. NWDAF is responsible for providing network analytics when requested by a network function (NF). Data is collected via application function (AF), operation, administration, and maintenance (OAM), NF, and data repositories. The specifications have also addressed the problem of inter-working for automation and data collection, which analytics vendors previously faced. 3GPP NWDAF framework for $5 \mathrm{G}$ systems is depicted in Fig. 5. This automation gives leverage to network vendors for the deployment and testing of non-RT ML-related use cases. In Fig. 5, inward interfaces aggregate data from different network sources, where communication occurs using existing service-based interfaces. Outward interfaces provide decisions (analytics-based, algorithmic) to AF and NF.
-
-
-
-Fig. 5. A generalized framework for 5G network automation in Release 16, representing that NWDAF should be able to collect data from the operator OAM, AFs and $5 \mathrm{GC}$ network functions $[7]$.
-
-Regarding PHY, ML techniques lag behind, due to a number of issues. First, PHY makes use of abstractions and mathematical models that are inferred from the physical reality and electromagnetic principles. As long as such models describe the real-world precisely, there is no need for ML. Nevertheless, in practice, models and fixed algorithms are inefficient when facing rapidly changing and heterogeneous environments. For example, using the same channel acquisition scheme to acquire CSI from a laptop in line-of-sight with a $\mathrm{gNB}$, a tablet on a fast train, or a mobile quickly moving in a super densely covered area might not be optimal. Consequently, the standardization efforts of intelligent techniques have gained momentum, and while 3GPP is ready to begin a study item on ML implementations, open-radio access network (O-RAN) will be ML-native, defining a RAN intelligent controller (RIC), which will enhance several RAN functions.
-
-3GPP has started studying the implications of the ML use at layer-1 and a study item on ML for NR air interface has been agreed upon. After the RAN-1 working group studies, protocol aspects will be studied in RAN-2 and subsequently, interoperability and testability aspects will be considered in RAN-4 working group. The remaining part of this section summarizes the status of the standardization of ML techniques for PHY for both 3GPP and O-RAN.
-
-## A. CSI Feedback
-
-CSI feedback for downlink channel in Release 17 is a complex issue in which UE-based beam selection is followed by CSI reference symbols (RS) training and precoding matrix index (PMI) reporting, and lastly by Demodulation Reference Signal (DMRS) and consequent estimation of the precoded channel. Broadly, beam selection aims to establish a sufficiently strong link budget between the UEs and the gNB. The CSI-RS is used for fine channel estimation, which is then fed back to the gNB to compute a precoder (eventually multiuser); finally, DMRS are precoded pilots that the UEs use to implement coherent demodulation. Currently, each of these phases is created following pre-established rules, with little to none room for intelligent behaviour. ML has been envisioned to possibly enhance each phase in a different way. Beam selection can be improved by intelligently correlating the beams with position or identity of the UEs. This would allow for a smart selection of the beams from the gNB side, thus avoiding brute-force selection. The CSI-RS can be enhanced by compressing the pilots and the PMI feedback exploiting ad hoc ML compressors. Furthermore, channel prediction techniques [5] can be used in order to pre-establish a baseline for the CSI feedback [3]. Other aspects that can be improved include frequency of pilots in both CSI-RS and DMRS, power and timing and CSI-RS port selection.
-
-## B. $R S-D M R S$
-
-Roughly speaking, DMRS are RS used for channel estimation to perform coherent demodulation. The correct estimation of the channel using such pilots have a strong impact on the performance in terms of bit-error-rate and thus block-errorrate. The role of the ML in such domain is twofold. First, it can be used to improve the performance of the channel estimation. Second, the ML can provide a smarter positioning of DMRS in order to reduce their frequency, hence reducing the overhead footprint in $6 \mathrm{G}$.
-
-## C. Positioning
-
-A precise positioning is one of the aspects that sees the largest improvement with respect to LTE's observed time difference of arrival (OTDOA) and uplink time difference of arrival (UTDOA), defined in Release 9 onward. Various aspects of $6 \mathrm{G}$ allow for precise positioning of the UE, such as large number of antenna elements at the $\mathrm{gNB}$, millimeter wave transmissions, dense network deployment. However, the methods based on angle-of-arrival and time-of-arrival fall short when non-line-of-sight scenarios are considered, in interference-limited scenarios. ML techniques, see Fig.2, are expected to help in improving the position by exploiting channel charting, hence learning the likely position of a UE based on a report, and multiplexing together information that carries positioning information but are hard to exploit in a classical way, such as CSI report and sounding reference signal maps.
-
-## D. Mobility Enhancements
-
-In 6G, frequent cell-selection, and frequent RSRP measurement could impact UEs' battery life. Furthermore, load balancing algorithms can use intelligent techniques that exploit the UE specific channel prediction, movement trajectory prediction and traffic demands prediction. Furthermore, the scenarios like fast-trains or non-terrestrial networks, will pose challenges to $\mathrm{HO}$ and conditional-HO operations. Novel solutions envisaged, compared to current 3GPP Release 17, include the use of UE specific ML-based predictive algorithms, addressed in Section III-B, designed to reduce paging errors and HO failures; thus, improve the overall QoS.
-
-## E. Standardization for ML Data Collection
-
-3GPP has started working on data collection for running ML algorithms in 5G networks [14]. The scope of such studies include identifying mechanisms to collect data from the network through minimization of drive test framework or further advanced enhancements. Furthermore, studies will focus on discussing hosting of ML models both for training as well as inference purposes at various network entities for various use cases and defining any new interfaces required for transporting data to the models.
-
-## F. Federated Learning Model Collection
-
-Training and prediction based on ML models will put an extra load on networks already transporting a large volume of data. Therefore, it is important to estimate the effect of model training and inference on network traffic, particularly for federated learning (FL) where UEs will act as distributed hosts [15]. The latency in collecting locally trained models is bounded in FL and network links should be able to meet delay budgets. This is particularly challenging in today's networks where a UE's own QoS requirements are already demanding and the FL model training and collection will further incur an extra burden on the network. Similarly, the split inference, where UEs cooperate with each other to perform joint inference, results in increasing the network traffic. 3GPP studies in Release 18 [15] will focus on the above mentioned issues to support training and inference for ML/FL models over wireless links.
-
-## G. O-RAN-RIC
-
-O-RAN alliance, aims to define a RAN network that is non-vendor specific, and that has an innate support for ML as an enabler for automation and OPEX savings. O-RAN alliance has defined interfaces for exchange of information in the protocol stack. To this end, in the O-RAN architecture, ML-assisted RAN intelligent controller (RIC) is included for network automation, for both scenarios, i.e., non-RT and RT. In the non-RT RIC, ML algorithms' training is done by using the data obtained at lower layers. However, the learning process remains slow; therefore, it is called non-RT RIC. Later, the learner is fed into the RT RIC, which utilizes the RT captured data to perform decisions online. Additionally, the functionality of non-RT includes policy management and higher layer procedure optimization. Therefore, the RAN or core-network can deploy such a mechanism based on the collected data.
-
-## V. Open Challenges and Roadmap for Deploying ML TECHNIQUES
-
-Though ML is a potential technology and enabler for nextgeneration wireless networks, several challenges related to its practical use are addressed below.
-
-## A. Data Availability and Benchmarking
-
-One of the foremost challenges in wireless networks is data availability. Data availability concerns the problem of identifying a common and accepted set of data (e.g., channel realizations) with the goal of testing and benchmarking ML algorithms. Such a problem is of a pivotal importance for standardization, where normally algorithms and proposals are tested using agreed underlying physical models (e.g., urban macrocells/microcells channel models), evaluation methodologies and calibrated simulators. Contrary to other fields, cellular networks have no standard data set to train and benchmark an ML algorithm. Therefore, a synthetic data set or software generated data set is of a predominant importance to train and benchmark ML algorithm(s), and to agree on a common evaluation methodology to rank proposition and standard algorithms.
-
-Identifying a set of key performance indicators in wireless networks is another crucial task for ML standardization. It is necessary to design a set of metrics to classify and rank ML algorithms and their performance. Classic approaches such as throughput and signal-to-interference-plus-noise ratio (SINR) might not be sufficient since a small improvement in these values might come at the cost of large complexity augmentation and exacerbated energy consumption.
-
-
-
-Fig. 6. Model collection for FL in a wireless network when some of the UEs have large blockage and use D2D communication for model transfer. Cluster-based UE selection is another solution for asynchronous model collection to meet network QoS requirements.
-
-## B. Selection of ML versus Non-ML Solutions
-
-ML tools are regarded as an implementation-oriented tool rather than a standard relevant aspect. The idea behind this relies on the fact that each vendor has the freedom to efficiently implement each aspect of the standard as long as the external interfaces are respected. A simple example of this is given in the CSI feedback, where a UE needs to select a specific PMI, but the standard does not specify any specific way in which this selection is performed. Recently, however, the idea of having ML dedicated message exchanges and performance that only an ML-aided algorithm can achieve has paved the way for standardization of ML algorithms [3]. This opens the door for several issues, e.g., will the standard impose a specific ML structure, classifying minimum performance and implementation structure, or will it remain far from the implementation? With regards to NNs, it is still open if hyperparameters are going to be left to vendor-specific implementation or will they be set by the standard.
-
-## C. Complexity of ML Algorithms
-
-Considering the limited battery life, storage, computational capability, and limited communication bandwidth in most cellular network entities, an ML model's cost-performance tradeoff becomes a fundamental issue. Another issue is the speed/time-steps at which the training and inference needs to be performed. Whereas hard-wired gNB have sufficient computational power to run complex ML algorithms, UEs need to face battery, heating and stringent complexity limits. Possible solutions to such issue include, but not limited to implementation of substitute rule-based algorithms at the UE side, migrating the load all on the $\mathrm{gNB}$ side.
-
-## D. Communication-aware Federated Learning
-
-Traditional ML models support centralized learning. Due to difficulties in collecting large amount of training data from the UEs, privacy issues and bandwidth bottleneck, FL has emerged as a promising solution. In FL, training is performed distributively over network devices, called local model hosts, and an application server on the network side acts as a central host to aggregate local models transmitted by the local learners. Typically, an application server host aggregates models only when updates are available from all the local learners, called synchronous model transfer. However, this is highly inefficient in wireless networks where links are unpredictable, local learners (UEs) are energy limited and have their own QoS requirements. Asynchronous model collection is the most viable solution for FL in wireless networks, where a subset of UEs is selected for a local model update in each round of model collection. However, UE selection in each round is a complex problem because UEs are energy limited and the network bandwidth is scarce, hindering collection of local models from all the UEs to represent independently and identically data collection. These mechanisms are usually vendor proprietary, but standardization still needs to define some common mechanisms for efficient model collection. As shown in Fig. 6. UE clustering and local device-to-device (D2D) communication for asynchronous model collection are possible solutions to decrease network communication and will require standardization support.
-
-## E. Stability and Adaptability of ML Techniques
-
-ML algorithms applied to wireless networks must be adaptive as they will have to deal with parameters that change dynamically. Particularly, the weights of the NN are evaluated online based on the trained data. However, this approach may not be applicable in wireless, and specifically in a standard, where coordination among entities belonging to different operators and provided by different vendors have to coexist, and in which the need for quick response could prevent one or the other solution. Possible solutions include: pre-trained $\mathrm{NN}$, or partially trained $\mathrm{NN}$ (i.e., $\mathrm{NN}$ in which the starting point is pre-set); cloud-based downloadable data set for $\mathrm{NN}$ training; codebook-based $\mathrm{NN}$, in which a codebook of different NNs is used and agreed upon between the gNB and UEs. Another related problem is to detect an outdated ML model with high inference error and replace it. Replacing an outdated model with a new model incurs further delay. Thus, there must be a proactive mechanism to adapt the ML model to network conditions such that network functions suffer minimum performance loss.
-
-## VI. Conclusion
-
-Motivated by the promise of the use of ML algorithms, we presented an overview of ML techniques to be used in 5G-Advanced and 6G wireless networks. Furthermore, we discussed the key roles of ML-based solutions from industrial and standardization perspectives. We also highlighted the practical challenges of deploying ML techniques in wireless networks and how to deal with them. Non-RT and higher layer ML-based solutions can be, and are, applied already in today's networks. Implementing RT ML solutions at PHY/MAC in 6G networks are the next big challenge in the research community. We believe that overcoming these challenges, both in research as well as at standardization levels, will pave the way for next-generation wireless communication to be effective and sustainable.
-
-## REFERENCES
-
-[1] I. Union, "IMT traffic estimates for the years 2020 to 2030," Report ITU, pp. 2370-0, 2015.
-
-[2] A.-A. A. Boulogeorgos, E. Yaqub, M. Di Renzo, A. Alexiou, R. Desai, and R. Klinkenberg, "Machine learning: A catalyst for $\mathrm{THz}$ wireless networks," Frontiers in Communications and Networks, p. 37, 2021.
-
-[3] M. K. Shehzad, L. Rose, and M. Assaad, "Dealing with CSI compression to reduce losses and overhead: An artificial intelligence approach," in 2021 IEEE International Conference on Communications Workshops (ICC Workshops), 2021, pp. 1-6.
-
-[4] T. O'Shea and J. Hoydis, "An introduction to deep learning for the physical layer," IEEE Transactions on Cognitive Communications and Networking, vol. 3, no. 4, pp. 563-575, 2017.
-
-[5] M. K. Shehzad, L. Rose, S. Wesemann, and M. Assaad, "ML-based massive MIMO channel prediction: Does it work on real-world data?" IEEE Wireless Communications Letters, pp. 1-5, 2022.
-
-[6] B. Mao, F. Tang, Y. Kawamoto, and N. Kato, "Optimizing computation offloading in satellite-UAV-served 6G IoT: A deep learning approach," IEEE Network, vol. 35, no. 4, pp. 102-108, 2021.
-
-[7] 3GPP, "Study of enablers for network automation for 5G (Release 16)," https://portal.3gpp.org/desktopmodules/Specifications/ SpecificationDetails.aspx?specificationId=3252, , Technical Report (TR) $23.791,062019$.
-
-[8] J. Hoydis, F. A. Aoudia, A. Valcarce, and H. Viswanathan, "Toward a 6G AI-native air interface," IEEE Communications Magazine, vol. 59, no. 5, pp. 76-81, 2021.
-
-[9] F. Tariq, M. R. Khandaker, K.-K. Wong, M. A. Imran, M. Bennis, and M. Debbah, "A speculative study on 6G," IEEE Wireless Communications, vol. 27, no. 4, pp. 118-125, 2020.
-
-[10] R. Shafin, L. Liu, V. Chandrasekhar, H. Chen, J. Reed, and J. C. Zhang, "Artificial intelligence-enabled cellular networks: A critical path to beyond-5G and 6G," IEEE Wireless Communications, vol. 27, no. 2, pp. 212-217, 2020.
-
-[11] R. Zhong, Y. Liu, X. Mu, Y. Chen, and L. Song, "AI empowered RISassisted NOMA networks: Deep learning or reinforcement learning?" IEEE Journal on Selected Areas in Communications, vol. 40, no. 1, pp. $182-196,2022$.
-
-[12] M. M. Butt, A. Pantelidou, and I. Z. Kovács, "ML-assisted UE positioning: performance analysis and 5G architecture enhancements," IEEE Open Journal of Vehicular Technology, vol. 2, pp. 377-388, 2021.
-
-[13] 3GPP, "NR; Radio Resource Control (RRC); Protocol specification (Release 15)," https://portal.3gpp.org/desktopmodules/Specifications/ SpecificationDetails.aspx?specificationId=3197 , Technical report (TR) TS38.331, 032021.
-
-[14] - , "Study on enhancement for data collection for NR and ENDC (Release 17)," https://portal.3gpp.org/desktopmodules/Specifications/ SpecificationDetails.aspx?specificationId=3817 , Technical report (TR) $37.817,012021$.
-
-[15] -, "5G System (5GS); Study on traffic characteristics and performance requirements for AI/ML model transfer (Release 18)," https://portal.3gpp.org/desktopmodules/Specifications/ SpecificationDetails.aspx?specificationId=3721 , Technical report (TR) $22.874,032021$.
-
-
-
-Muhammad K. Shehzad [S'21] is working as a Research Engineer and Ph.D. student at Nokia Bell-Labs and CentraleSupelec, Paris, France, respectively. He received his B.Eng. (Hons.) degree in Electrical and Electronic Engineering from the University of Bradford, Bradford, U.K., in 2016, and M.S. in Electrical Engineering from the National University of Sciences \& Technology (NUST), Islamabad, Pakistan, in 2019. His major research interest is in MIMO communication using Artificial Intelligence (AI)/Machine Learning (ML).
-
-
-
-Luca Rose [M'11] is Senior research and standard-ization expert with Nokia Bell-labs. He received his M.Sc. from university of Pisa, Italy, and his Ph.D. in Physics from Centrale-Supelec. He worked with Huawei France research center and Thales Communications and Security, contributing to several standard organizations. He is currently an ITU-R and ETSI delegate and the lead editor of IEEE Communication magazine series on IoT. His interests span from the field of AI/ML to Game theory.
-
-
-
-M. Majid Butt [SM'15] is a Senior Specialist at Nokia Bell-Labs, France, and an adjunct Professor at Trinity College Dublin, Ireland. He has authored more than 70 peer-reviewed conference and journal articles and filed over 30 patents. He is IEEE Comsoc distinguished lecturer for the class 2022-23. He frequently gives invited and technical tutorial talks on various topics in IEEE conferences and serves as an associate editor for IEEE Communication Magazine, IEEE Open Journal of the Communication Society and IEEE Open Journal of Vehicular Technology.
-
-
-
-István Z. Kovács [M’00] received his B.Sc. from "Politehnica" Technical University of Timişoara, Romania in 1989, his M.Sc.E.E. from École Nationale Supérieure des Télécommunications de Bretagne, France in 1996, and his Ph.D.E.E. in Wireless Communications from Aalborg University, Denmark in 2002. Currently he is senior research engineer at Nokia, Aalborg, Denmark, where he conducts research on machine learning-driven radio resource management and radio connectivity enhancements for non-terrestrial and aerial vehicle communications, in LTE and 5G networks.
-
-
-
-Mohamad Assaad [SM'15] is a Professor at CentraleSupelec, France and a researcher at the Laboratory of Signals and Systems (CNRS). He has coauthored 1 book and more than 120 journal and conference papers and serves regularly as TPC cochair for top-tier international conferences. He is currently an Editor for the IEEE Wireless Communications Letters and Journal of Communications and Information Networks. His research interests include 5G and beyond systems, and Machine Learning in wireless networks.
-
-
-
-Mohsen Guizani [F'09] is currently a Professor at the Machine Learning Department at the Mohamed Bin Zayed University of Artificial Intelligence (MBZUAI), Abu Dhabi, UAE. His main research interests are wireless communications and IoT security. He was elevated to the IEEE Fellow in 2009. He was listed as a Clarivate Analytics Highly Cited Researcher in Computer Science in 2019, 2020 and 2021. Dr. Guizani has won several research awards. He is the author of ten books and more than 800 publications.

+ 0 - 28
tests/benchmark/datasets/annotations/cleaned/cleaned_ordinary_textbook_1d9a847603a5e37e379738316820850d.md

@@ -1,28 +0,0 @@
-# 数学新星问题征解 
-
-第十五期 (2016.06)
-
-主持: 牟晓生
-
-第一题. 设 $z_{1}, z_{2}, z_{3}$ 是单位复数. 证明存在单位复数 $z$ 使得:
-
-$$
-\frac{1}{\left|z-z_{1}\right|^{2}}+\frac{1}{\left|z-z_{2}\right|^{2}}+\frac{1}{\left|z-z_{3}\right|^{2}} \leq \frac{9}{4}
-$$
-
-(湖北武钢三中学生 王逸轩, 上海大学冷岗松 供题)
-
-第二题. 如图, $D$ 是正三角形 $A B C$ 的边 $B C$ 上一点, $B D>C D$. 记 $O_{1}, I_{1}$ 为 $\triangle A B D$ 的外心与内心, $O_{2}, I_{2}$ 为 $\triangle A C D$ 的外心与内心. 圆 $I_{1}$ 与圆 $I_{2}$ 除 $B C$外的另一条外公切线交 $A B, A C$ 于 $P, Q$. 设直线 $P I_{1}$与 $Q I_{2}$ 交于 $R$, 而直线 $O_{1} I_{1}$ 与 $O_{2} I_{2}$ 交于 $T$. 证明: $A T^{2}=A R^{2}+A D \cdot B C$.
-
-(广西钦州 卢圣 供题)
-
-
-
-第三题. 给定正整数 $m, n$, 考虑在 $m \times n$ 白棋盘上先将一些格染成黑色. 在之后的每一时刻, 若存在一个白格至少与两个黑格相邻, 则可将它也染成黑色. 求最初至少要染多少个黑色格才能在某一时刻染黑整个棋盘?
-
-(哈佛大学 牟晓生 供题)
-
-第四题. $A B C$ 是一个三角形, 而 $P, Q, R$ 分别是 $B C, C A, A B$ 上的点。证明 $\triangle P Q R$ 的周长不小于 $\triangle A Q R, \triangle B R P, \triangle C P Q$ 周长的最小值.
-
-(哈佛大学 牟晓生 供题)
-

+ 0 - 246
tests/benchmark/datasets/annotations/cleaned/cleaned_research_report_1f978cd81fb7260c8f7644039ec2c054.md

@@ -1,246 +0,0 @@
-## 增持(维持)
-
-所属行业:机械设备
-
-当前价格(元): 82.42
-
-## 证券分析师
-
-倪正洋
-
-资格编号:S0120521020003
-
-邮箱: nizy@tebon.com.cn
-
-## 研究助理
-
-杨云道
-
-邮箱: yangyx@tebon.com.cn
-
-
-
-| 沪深 300 对比 | $1 \mathrm{M}$ | $2 \mathrm{M}$ | $3 \mathrm{M}$ |
-| :--- | ---: | ---: | ---: |
-| 绝对涨幅(\%) | 7.18 | 32.88 | 80.86 |
-| 相对涨幅(\%) | 8.10 | 25.93 | 78.39 |
-
-资料来源: 德邦研究所, 聚源数据
-
-## 相关研究
-
-1.《高测股份 (688556): 光伏金刚线及硅片切割代工业务推动公司 22Q1 业绩大超预期》, 2022.4.29
-
-2.《光伏设备: 光伏高效电池扩产提速,关键设备商各领风骚》, 2022.4.10 3. 《高测股份 (688556.SH): 再签建湖 10GW 硅片切割代工产能,强化代工业务成长逻辑》, 2022.4.7
-
-3.《高测股份 (688556.SH): 签订晶澳曲靖 2.2 亿元切割设备合同,看好 22 年代工业绩释放+HJT 切割工艺进步》, 2022.3.9
-
-4.《高测股份 (688556.SH): 21 年业绩预告超市场预期,关注切片代工利润释放》, 2022.1.24
-
-# 高测股份 $(688556.5 H):$ 扩产 4000 万公里金刚线,强化光伏碰片切割三元布局
-
-## 投资要点
-
-- 事件:公司拟与蓝关县人民政府签署的《壶关年产 12000 万千米金刚线项目投资协议书》,项目一期计划建设年产 4,000万千米金刚线产能,预计一期总投资额约 6.66 亿元; 后续年产 8,000 万千米金刚线项目尚未具体约定,存在较大不确定性。
-- 顺应下游需求扩张, 金刚线产能快速扩产, 保证公司内供+外销。光伏金刚线需求 22 年提升源于两方面:1)2022 年光伏产业链景气度高涨,1-5 月光伏装机同比 $+24.4 \%$, 带动产业链各环节开工率提升, 硅片前期扩产产能逐步落地, 金刚线需求释放;2)由于多晶硅料价格持续维持高位,细线化、薄片化趋势加速,其中细线化要求金刚线线径由 40 线、 38 线向 36 线、 35 线进步, 带动单 GW 切割线耗不断提升。目前 36 线单 GW 切割线耗约 50 万公里, 较 38 线提升约 $30 \%$ 。公司于 2021 年对金刚线进行 “ 1 机 12 线” 技改,技改完成后,公司 22 年 1 季度产能 712 万公里, 年化产能超 2500 万公里。公司目前切片代工产能约 47GW, 对应远期金刚线产能超 2300 万公里。本次扩产再一次扩充公司金刚线产能, 强化金刚线产能内供+外销布局。
-- 依托萦关低成本电价提升金刚线盈利能力, 顺应硅料节约持续推动细线化布局。公司在山西长治金刚线生产厂区采购电力的平均单价较青岛金刚线生产厂区采购电力的平均单价低, 2020 年度公司陆续将青岛的金刚线生产线搬迁到山西长治並关厂区,随着山西长治金刚线生产厂区金刚线产量增加,公司采购电力的平均单价呈下降趋势。目前公司电力采购单价从 2019 年 0.8 元/kwh 降低到 2022 年 Q1 的 0.39 元/kwh,並关后续拓展有望进一步降低公司金刚线电价成本。金刚线线径越细,锯㖓越小,切割时产生的锯㖓硅料损失越少,同样一根硅棒可切割加工出的硅片数量越多,制造硅片所需的硅材料越少。相同切割工艺下,金刚线越细,固结在钢线基体上的金刚石微粉颗粒越小,切割加工时对硅片的表面损伤越小,硅片表面质量越好,砝片 TTV 等质量指标表现也就越好。金刚线母线直径已由 2016 年的 80um 降至 2022 年上半年的 36、38、40um,此外高线速、柔性化和智能化等均是金刚线及切片技术进步方向, 公司在薄片、细线化、高线速、柔性智能化方面均有领先布局, 推动切割工艺持续进步。
-- 切割工艺的持续进步领先, 是保障公司利润释放的核心壁垒。公司光伏硅片切割三元布局包括硅片切割及机加工设备、砝片切割耗材 (金刚线) 以及切割代工业务。公司 2021 年依托前期设备+耗材布局切割代工业务, 目前已公布 47GW 产能 (乐山5GW 示范基地、乐山 20GW 大硅片及配套项目、建湖一期 10GW 项目,建湖二期 $12 \mathrm{GW}$ 项目), 客户包括通威、京运通、美科及建湖周边电池企业。22 年底公司有望实现超 20GW 切割代工产能, 且当前终端客户主要为下游电池企业。客户选择切割代工模式的核心在于凭借高测的专业化服务实现快速上产, 同时可获得较自建硅片切割产能或购买硅片更多的超额利润。超额利润的核心在于高测股份的切割代工技术领先, 可实现更多的硅片切割红利, 并与客户共享。未来随着金刚线扩产和切割技术进步, 公司光伏硅片切割代工利润弹性有望持续释放。
-- 盈利预测与投资建议:预计公司 2022-2024 年归母净利润 4.7、7.2、9.3 亿元,对应 PE 30、20、15 倍,维持 “增持” 评级。
-- 风险提示:硅片扩产不及预期,公司代工业务利润波动风险,市场竞争加剧。
-
-<table><thead><tr><th>股票数据</th><th></th></tr></thead><tr><td>总股本(百万股):</td><td>227.92</td></tr><tr><td>流通 A 股(百万股):</td><td>167.01</td></tr><tr><td>52 周内股价区间(元):</td><td>21.60-97.40</td></tr><tr><td>总市值(百万元):</td><td>18,785.44</td></tr><tr><td>总资产(百万元):</td><td>3,508.81</td></tr><tr><td>每股净资产(元):</td><td>5.50</td></tr><tr><td>咨料来源,公司公告</td><td></td></tr></table>
-
-<table><thead><tr><th>主要财务数据及预测</th><th></th><th></th><th></th><th></th><th></th></tr></thead><tr><td></td><td>2020</td><td>2021</td><td>2022E</td><td>2023E</td><td>2024E</td></tr><tr><td>营业收入(百万元)</td><td>746</td><td>1,567</td><td>3,684</td><td>5,056</td><td>5,752</td></tr><tr><td>(+/-)YOY(%)</td><td>4.5\%</td><td>110.0\%</td><td>135.1\%</td><td>37.2\%</td><td>13.8\%</td></tr><tr><td>净利润(百万元)</td><td>59</td><td>173</td><td>471</td><td>717</td><td>933</td></tr><tr><td>(+/-)YOY(%)</td><td>83.8\%</td><td>193.4\%</td><td>172.8\%</td><td>52.2\%</td><td>30.1\%</td></tr><tr><td>全面摊薄 EPS(元)</td><td>0.43</td><td>1.07</td><td>2.91</td><td>4.43</td><td>5.77</td></tr><tr><td>毛利率(\%)</td><td>35.3\%</td><td>33.7\%</td><td>35.0\%</td><td>36.0\%</td><td>38.0\%</td></tr><tr><td>净资产收益率(\%)</td><td>6.0\%</td><td>15.0\%</td><td>27.9\%</td><td>28.8\%</td><td>26.5\%</td></tr></table>
-
-资料来源: 公司年报 (2020-2021),德邦研究所
-
-备注: 净利润为归属母公司所有者的净利润
-
-## 财务报表分析和预测
-
-| 主要财务指标 | 2021 | $2022 E$ | $2023 E$ | $2024 E$ |
-| :--- | ---: | ---: | ---: | ---: |
-| 每股指标(元) |  |  |  |  |
-| 每股收益 | 1.07 | 2.91 | 4.43 | 5.77 |
-| 每股净资产 | 7.13 | 10.43 | 15.39 | 21.76 |
-| 每股经营现金流 | 0.47 | 1.27 | 4.07 | 5.02 |
-| 每股股利 | 0.11 | 0.11 | 0.11 | 0.11 |
-| 价值评估(倍) |  |  |  |  |
-| P/E | 82.90 | 30.47 | 20.02 | 15.38 |
-| P/B | 12.44 | 8.50 | 5.76 | 4.08 |
-| P/S | 8.52 | 3.62 | 2.64 | 2.32 |
-| EV/EBITDA | 49.85 | 24.12 | 15.68 | 11.46 |
-| 股息率\% | $0.1 \%$ | $0.1 \%$ | $0.1 \%$ | $0.1 \%$ |
-| 盈利能力指标(\%) |  |  |  |  |
-| 毛利率 | $33.7 \%$ | $35.0 \%$ | $36.0 \%$ | $38.0 \%$ |
-| 净利润率 | $11.0 \%$ | $12.8 \%$ | $14.2 \%$ | $16.2 \%$ |
-| 净资产收益率 | $15.0 \%$ | $27.9 \%$ | $28.8 \%$ | $26.5 \%$ |
-| 资产回报率 | $5.3 \%$ | $7.9 \%$ | $8.5 \%$ | $9.2 \%$ |
-| 投资回报率 | $15.3 \%$ | $25.9 \%$ | $24.6 \%$ | $23.7 \%$ |
-| 盈利增长(\%) |  |  |  |  |
-| 营业收入增长率 | $110.0 \%$ | $135.1 \%$ | $37.2 \%$ | $13.8 \%$ |
-| EBIT 增长率 | $233.7 \%$ | $150.7 \%$ | $52.3 \%$ | $31.9 \%$ |
-| 净利润增长率 | $193.4 \%$ | $172.8 \%$ | $52.2 \%$ | $30.1 \%$ |
-| 偿倩能力指标 |  |  |  |  |
-| 资产负债率 | $64.3 \%$ | $71.5 \%$ | $70.6 \%$ | $65.3 \%$ |
-| 流动比率 | 1.2 | 1.2 | 1.3 | 1.4 |
-| 速动比率 | 0.9 | 0.9 | 1.0 | 1.1 |
-| 现金比率 | 0.2 | 0.1 | 0.2 | 0.3 |
-| 经营效率指标 |  |  |  |  |
-| 应收怅款周转天数 | 161.7 | 165.1 | 164.9 | 164.4 |
-| 存货周转天数 | 196.1 | 170.0 | 180.0 | 190.0 |
-| 总资产周转率 | 0.5 | 0.6 | 0.6 | 0.6 |
-| 固定资产周转率 | 4.2 | 8.6 | 10.3 | 11.1 |
-
-| 现金流量表(百万元) | 2021 | $2022 E$ | 2023E | 2024E |
-| :--- | ---: | ---: | ---: | ---: |
-| 净利润 | 173 | 471 | 717 | 933 |
-| 少数股东损益 | 0 | 0 | 0 | 0 |
-| 非现金支出 | 107 | 114 | 133 | 147 |
-| 非经营收益 | 17 | 1 | 4 | 14 |
-| 营运资金变动 | -220 | -382 | -195 | -283 |
-| 经营活动现金流 | 76 | 205 | 658 | 812 |
-| 资产 | -83 | -184 | -203 | -169 |
-| 投资 | 229 | 0 | 0 | 0 |
-| 其他 | 6 | 9 | 13 | 14 |
-| 投资活动现金流 | 151 | -175 | -190 | -155 |
-| 债权募资 | -80 | 39 | 321 | 64 |
-| 股权募资 | 0 | 0 | 0 | 0 |
-| 其他活 | -21 | -3 | -14 | -25 |
-| 融资活动现金流 | -101 | 36 | 307 | 39 |
-| 现金净流量 | 127 | 66 | 775 | 696 |
-
-备注: 表中计算估值指标的收盘价日期为 7 月 19 日
-
-资料来源: 公司年报 (2020-2021), 德邦研究所
-
-| 利润表(百万元) | 2021 | 2022E | 2023E | 2024E |
-| :---: | :---: | :---: | :---: | :---: |
-| 营业总收入 | 1,567 | 3,684 | 5,056 | 5,752 |
-| 营业成本 | 1,038 | 2,394 | 3,236 | 3,567 |
-| 毛利率\% | $33.7 \%$ | $35.0 \%$ | $36.0 \%$ | $38.0 \%$ |
-| 营业税金及附加 | 6 | 18 | 25 | 29 |
-| 营业税金率\% | $0.4 \%$ | $0.5 \%$ | $0.5 \%$ | $0.5 \%$ |
-| 营业费用 | 63 | 147 | 193 | 209 |
-| 营业费用率\% | $4.0 \%$ | $4.0 \%$ | $3.8 \%$ | $3.6 \%$ |
-| 管理费用 | 131 | 313 | 409 | 444 |
-| 管理费用率\% | $8.4 \%$ | $8.5 \%$ | $8.1 \%$ | $7.7 \%$ |
-| 研发费用 | 117 | 276 | 379 | 431 |
-| 研发费用率\% | $7.5 \%$ | $7.5 \%$ | $7.5 \%$ | $7.5 \%$ |
-| EBIT | 213 | 534 | 814 | 1,074 |
-| 财务费用 | 7 | 1 | 11 | 19 |
-| 财务费用率\% | $0.4 \%$ | $0.0 \%$ | $0.2 \%$ | $0.3 \%$ |
-| 资产减值损失 | -33 | -63 | -86 | -98 |
-| 投资收益 | 5 | 9 | 13 | 14 |
-| 营业利润 | 212 | 531 | 800 | 1,040 |
-| 营业外收支 | -25 | -8 | -3 | -3 |
-| 利润总额 | 187 | 523 | 797 | 1,037 |
-| EBITDA | 282 | 582 | 865 | 1,129 |
-| 所得税 | 14 | 52 | 80 | 104 |
-| 有效所得税率\% | $7.7 \%$ | $10.0 \%$ | $10.0 \%$ | $10.0 \%$ |
-| 少数股东损益 | 0 | 0 | 0 | $\mathbf{0}-1-2$ |
-| 归属母公司所有者净利润 | 173 | 471 | 717 | 933 |
-
-| 资产负债表(百万元) | 2021 | 2022E | 2023E | $2024 E$ |
-| :---: | :---: | :---: | :---: | :---: |
-| 货币资金 | 427 | 494 | 1,269 | 1,965 |
-| 应收账款及应收票据 | 1,173 | 2,806 | 3,798 | 4,344 |
-| 存货 | 558 | 1,115 | 1,596 | 1,857 |
-| 其它流动资产 | 266 | 578 | 736 | 778 |
-| 流动资产合计 | 2,424 | 4,992 | 7,400 | 8,943 |
-| 长期股权投资 | 0 | 0 | 0 | 0 |
-| 固定资产 | 370 | 429 | 491 | 516 |
-| 在建工程 | 169 | 183 | 205 | 226 |
-| 无形资产 | 42 | 56 | 69 | 80 |
-| 非流动资产合计 | 811 | 940 | 1,087 | 1,198 |
-| 资产总计 | 3,235 | 5,932 | 8,487 | 10,141 |
-| 短期借款 | 28 | 68 | 388 | 452 |
-| 应付票据及应付账款 | 1,401 | 3,197 | 4,302 | 4,760 |
-| 预收账款 | 0 | 0 | 0 | 0 |
-| 其它流动负债 | 560 | 887 | 1,214 | 1,314 |
-| 流动负债合计 | 1,989 | 4,152 | 5,904 | 6,527 |
-| 长期借款 | 0 | 0 | 0 | 0 |
-| 其它长期负债 | 92 | 92 | 92 | 92 |
-| 非流动负债合计 | 92 | 92 | 92 | 92 |
-| 负债总计 | 2,081 | 4,243 | 5,996 | 6,619 |
-| 实收资本 | 162 | 162 | 162 | 162 |
-| 普通股股东权益 | 1,154 | 1,688 | 2,491 | 3,522 |
-| 少数股东权益 | 0 | 0 | 0 | 0 |
-| 负债和所有者权益合计 | 3,235 | 5,932 | 8,487 | 10,141 |
-
-## 信息披露
-
-## 分析师与研究助理简介
-
-倪正洋,2021 年加入德邦证券,任研究所大制造组组长、机械行业首席分析师,拥有 5 年机械研究经验,1 年高端装备产业经验,南京大学材料学学士、上海交通大学材料学硕士。2020 年获得 iFinD 机械行业最具人气分析师, 所在团队曾获机械行业 2019 年新财富第三名,2017 年新财富第二名,2017 年金牛奖第二名,2016 年新财富第四名。
-
-## 分析师声明
-
-本人具有中国证券业协会授予的证券投资咨询执业资格,以勤勉的职业态度,独立、客观地出具本报告。本报告所采用的数据和信息均来自市场公开信息, 本人不保证该等信息的准确性或完整性。分析逻辑基于作者的职业理解,清晰准确地反映了作者的研究观点,结论不受任何第三方的授意或影响,特此声明。
-
-## 投资评级说明
-
-1.投资评级的比较和评级标准:
-
-以报告发布后的 6 个月内的市场表现为比较标准,报告发布日后 6 个月内的公司股价(或行业指数)的张跌幅相对同期市场基准指数的涨跌幅;
-
-2.市场基准指数的比较标准:
-
-A 股市场以上证综指或深证成指为基准;香港市场以恒生指数为基准;美国市场以标普 500 或纳斯达克综合指数为基准。
-
-<table>
-    <tr>
-        <td rowspan="11">1. 投资评级的比较和评级标准: 以报告发布后的 6 个月内的市场表 现为比较标准,报告发布日后 6 个 月内的公司股价(或行业指数)的 涨跌幅相对同期市场基准指数的涨 跌幅:<br> 2. 市场基准指数的比较标准: A股市场以上证综指或深证成指为基 准; 香港市场以恒生指数为基准; 美 国市场以标普500或纳斯达克综合指 数为基准。</td>
-    </tr>
-    <tr>
-        <td>类型</td>
-        <td>评级</td>
-        <td>说明</td>
-    </tr>
-        <td rowspan="5">股票评级</td>
-    </tr>
-    <tr>
-        <td>买入</td>
-        <td>相对强于市场表现 20%以上;</td>
-    </tr>
-    <tr>
-        <td>增持</td>
-        <td>相对强于市场表现 5% 20%;</td>
-    </tr>
-    <tr>
-        <td>中性</td>
-        <td>相对市场表现在-5% +5%之间波动;</td>
-    </tr>
-    <tr>
-        <td>减持</td>
-        <td>相对弱于市场表现 5%以下。</td>
-    </tr>
-    <tr>
-        <td rowspan="4">行业投资评级</td>
-    </tr>
-    <tr>
-        <td>优于大市</td>
-        <td>预期行业整体回报高于基准指数整体水平10%以上;</td>
-    </tr>
-    <tr>
-        <td>中性</td>
-        <td>预期行业整体回报介于基准指数整体水平-10%与 10%之间;</td>
-    </tr>
-    <tr>
-        <td>弱于大市</td>
-        <td>预期行业整体回报低于基准指数整体水平 10%以下。</td>
-    </tr>
-    <tr>
-</table>
-
-## 法律声明
-
-本报告仅供德邦证券股份有限公司(以下简称 “本公司”)的客户使用。本公司不会因接收人收到本报告而视其为客户。在任何情况下,本报告中的信息或所表述的意见并不构成对任何人的投资建议。在任何情况下,本公司不对任何人因使用本报告中的任何内容所引致的任何损失负任何责任。
-
-本报告所载的资料、意见及推测仅反映本公司于发布本报告当日的判断,本报告所指的证券或投资标的的价格、价值及投资收入可能会波动。在不同时期,本公司可发出与本报告所载资料、意见及推测不一致的报告。
-
-市场有风险,投资需谨慎。本报告所载的信息、材料及结论只提供特定客户作参考,不构成投资建议,也没有考虑到个别客户特殊的投资目标、财务状况或需要。客户应考虑本报告中的任何意见或建议是否符合其特定状况。在法律许可的情况下,德邦证券及其所属关联机构可能会持有报告中提到的公司所发行的证券并进行交易,还可能为这些公司提供投资银行服务或其他服务。
-
-本报告仅向特定客户传送,未经德邦证券研究所书面授权,本研究报告的任何部分均不得以任何方式制作任何形式的拷贝、复印件或复制品,或再次分发给任何其他人,或以任何侵犯本公司版权的其他方式使用。所有本报告中使用的商标、服务标记及标记均为本公司的商标、服务标记及标记。如欲引用或转载本文内容, 务必联络德邦证券研究所并获得许可, 并需注明出处为德邦证券研究所,且不得对本文进行有悖原意的引用和删改。
-
-根据中国证监会核发的经营证券业务许可,德邦证券股份有限公司的经营范围包括证券投资咨询业务。

BIN
tests/benchmark/datasets/pdf/academic_literature_0b2c9c91f5232541a7ace8984df306b2.pdf


BIN
tests/benchmark/datasets/pdf/academic_literature_f7904bc37cc2e25c1e3e412978854b10.pdf


BIN
tests/benchmark/datasets/pdf/academic_literature_fbdb99151e811688574c0c4c67341074.pdf


BIN
tests/benchmark/datasets/pdf/ordinary_textbook_1d9a847603a5e37e379738316820850d.pdf


BIN
tests/benchmark/datasets/pdf/research_report_1f978cd81fb7260c8f7644039ec2c054.pdf


+ 0 - 4
tests/benchmark/env.sh

@@ -1,4 +0,0 @@
-conda create -n MinerU python=3.10
-conda activate MinerU
-pip install magic-pdf
-#cp magic-pdf.template.json ~/magic-pdf.json

+ 0 - 9
tests/benchmark/magic-pdf.json

@@ -1,9 +0,0 @@
-{
-    "bucket_info":{
-        "bucket-name-1":["ak", "sk", "endpoint"],
-        "bucket-name-2":["ak", "sk", "endpoint"]
-    },
-    "temp-output-dir":"/tmp",
-    "models-dir":"/tmp/models",
-    "device-mode":"cpu"
-}

+ 0 - 131
tests/benchmark/pre_clean.py

@@ -1,131 +0,0 @@
-"""
-clean data
-"""
-import argparse
-import os
-import re
-import htmltabletomd # type: ignore
-import pypandoc
-import argparse
-
-parser = argparse.ArgumentParser(description="get tool type")
-parser.add_argument(
-    "--tool_name",
-    type=str,
-    required=True,
-    help="input tool name",
-)
-parser.add_argument(
-    "--download_dir",
-    type=str,
-    required=True,
-    help="input download dir",
-)
-args = parser.parse_args()
-
-def clean_markdown_images(content):
-    """
-    clean markdown images
-    """
-    pattern = re.compile(r'!\[[^\]]*\]\([^)]*\)', re.IGNORECASE)  
-    cleaned_content = pattern.sub('', content)   
-    return cleaned_content
-   
-def clean_ocrmath_photo(content):
-    """
-    clean ocrmath photo
-    """
-    pattern = re.compile(r'\\includegraphics\[.*?\]\{.*?\}', re.IGNORECASE)  
-    cleaned_content = pattern.sub('', content)   
-    return cleaned_content
-
-def convert_html_table_to_md(html_table):
-    """
-    convert html table to markdown table
-    """
-    lines = html_table.strip().split('\n')  
-    md_table = ''  
-    if lines and '<tr>' in lines[0]:  
-        in_thead = True  
-        for line in lines:  
-            if '<th>' in line:  
-                cells = re.findall(r'<th>(.*?)</th>', line)  
-                md_table += '| ' + ' | '.join(cells) + ' |\n'  
-                in_thead = False  
-            elif '<td>' in line and not in_thead:  
-                cells = re.findall(r'<td>(.*?)</td>', line)  
-                md_table += '| ' + ' | '.join(cells) + ' |\n'  
-        md_table = md_table.rstrip() + '\n'    
-    return md_table  
- 
-def convert_latext_to_md(content):
-    """
-    convert latex table to markdown table
-    """
-    tables = re.findall(r'\\begin\{tabular\}(.*?)\\end\{tabular\}', content, re.DOTALL)  
-    placeholders = []  
-    for table in tables:  
-        placeholder = f"<!-- TABLE_PLACEHOLDER_{len(placeholders)} -->"  
-        replace_str = f"\\begin{{tabular}}{table}cl\\end{{tabular}}"
-        content = content.replace(replace_str, placeholder)  
-        try:
-            pypandoc.convert_text(replace_str,  format="latex", to="md", outputfile="output.md", encoding="utf-8")
-        except:
-            markdown_string = replace_str
-        else: 
-            markdown_string = open('output.md', 'r', encoding='utf-8').read()
-        placeholders.append((placeholder, markdown_string)) 
-    new_content = content  
-    for placeholder, md_table in placeholders:  
-        new_content = new_content.replace(placeholder, md_table)  
-        # 写入文件  
-    return new_content
-
- 
-def convert_htmltale_to_md(content):
-    """
-    convert html table to markdown table
-    """
-    tables = re.findall(r'<table>(.*?)</table>', content, re.DOTALL)  
-    placeholders = []
-    for table in tables:
-        placeholder = f"<!-- TABLE_PLACEHOLDER_{len(placeholders)} -->"  
-        content = content.replace(f"<table>{table}</table>", placeholder)  
-        try:
-            convert_table = htmltabletomd.convert_table(table)
-        except:
-            convert_table = table
-        placeholders.append((placeholder,convert_table)) 
-    new_content = content  
-    for placeholder, md_table in placeholders:  
-        new_content = new_content.replace(placeholder, md_table)  
-        # 写入文件  
-    return new_content
-
-def clean_data(prod_type, download_dir):
-    """
-    clean data
-    """
-    tgt_dir = os.path.join(download_dir, prod_type, "cleaned")
-    if not os.path.exists(tgt_dir):  
-        os.makedirs(tgt_dir) 
-    source_dir = os.path.join(download_dir, prod_type)
-    filenames = os.listdir(source_dir)
-    for filename in filenames:
-        if filename.endswith('.md'):
-            input_file = os.path.join(source_dir, filename)
-            output_file = os.path.join(tgt_dir, "cleaned_" + filename)
-            with open(input_file, 'r', encoding='utf-8') as fr:
-                content = fr.read()
-                new_content = clean_markdown_images(content)
-                new_content = convert_html_table_to_md(new_content)
-                new_content = convert_latext_to_md(new_content)
-                new_content = convert_htmltale_to_md(new_content)
-                with open(output_file, 'w', encoding='utf-8') as fw:
-                    fw.write(new_content)
-
-
-if __name__ == '__main__':
-    tool_type = args.tool_name
-    download_dir = args.download_dir
-    clean_data(tool_type, download_dir)

+ 0 - 1
tests/benchmark/result.json

@@ -1 +0,0 @@
-{"average_sim_score":0, "average_edit_distance":0, "average_bleu_score": 0}

+ 0 - 48
tests/benchmark/scoring.py

@@ -1,48 +0,0 @@
-import math
-
-from rapidfuzz import fuzz
-import re
-import regex
-from statistics import mean
-
-CHUNK_MIN_CHARS = 25
-
-def chunk_text(text, chunk_len=500):
-    chunks = [text[i:i+chunk_len] for i in range(0, len(text), chunk_len)]
-    chunks = [c for c in chunks if c.strip() and len(c) > CHUNK_MIN_CHARS]
-    return chunks
-
-
-def overlap_score(hypothesis_chunks, reference_chunks):
-    if len(reference_chunks) > 0:
-        length_modifier = len(hypothesis_chunks) / len(reference_chunks)
-    else:
-        length_modifier = 0
-    search_distance = max(len(reference_chunks) // 5, 10)
-    chunk_scores = []
-    for i, hyp_chunk in enumerate(hypothesis_chunks):
-        max_score = 0
-        total_len = 0
-        i_offset = int(i * length_modifier)
-        chunk_range = range(max(0, i_offset-search_distance), min(len(reference_chunks), i_offset+search_distance))
-        for j in chunk_range:
-            ref_chunk = reference_chunks[j]
-            score = fuzz.ratio(hyp_chunk, ref_chunk, score_cutoff=30) / 100
-            if score > max_score:
-                max_score = score
-                total_len = len(ref_chunk)
-        chunk_scores.append(max_score)
-    return chunk_scores
-
-
-def score_text(hypothesis, reference):
-    # Returns a 0-1 alignment score
-    hypothesis_chunks = chunk_text(hypothesis)
-    reference_chunks = chunk_text(reference)
-    chunk_scores = overlap_score(hypothesis_chunks, reference_chunks)
-    if len(chunk_scores) > 0:
-        mean_score = mean(chunk_scores)
-        return mean_score
-    else:
-        return 0
-    #return mean(chunk_scores)

+ 1 - 1
tests/test_cli/conf/conf.py

@@ -2,6 +2,6 @@ import os
 conf = {
 "code_path": os.environ.get('GITHUB_WORKSPACE'),
 "pdf_dev_path" : os.environ.get('GITHUB_WORKSPACE') + "/tests/test_cli/pdf_dev",
-"pdf_res_path": "/tmp"
+"pdf_res_path": "/tmp/magic-pdf"
 }
 

Diff do ficheiro suprimidas por serem muito extensas
+ 0 - 0
tests/test_cli/pdf_dev/14a75ee1-b88a-4fe7-bb10-62cbfabbfdec.html.json


BIN
tests/test_cli/pdf_dev/14a75ee1-b88a-4fe7-bb10-62cbfabbfdec.html.pdf


Diff do ficheiro suprimidas por serem muito extensas
+ 0 - 0
tests/test_cli/pdf_dev/2365839d-4116-45de-b2f0-3a740e1d6c20.html.json


BIN
tests/test_cli/pdf_dev/2365839d-4116-45de-b2f0-3a740e1d6c20.html.pdf


Diff do ficheiro suprimidas por serem muito extensas
+ 0 - 0
tests/test_cli/pdf_dev/24cb61a0-cace-460a-a42b-495a86caf88f.html.json


BIN
tests/test_cli/pdf_dev/24cb61a0-cace-460a-a42b-495a86caf88f.html.pdf


Diff do ficheiro suprimidas por serem muito extensas
+ 0 - 0
tests/test_cli/pdf_dev/300970fd-b34a-4656-a334-23059595b360.html.json


BIN
tests/test_cli/pdf_dev/300970fd-b34a-4656-a334-23059595b360.html.pdf


Diff do ficheiro suprimidas por serem muito extensas
+ 0 - 0
tests/test_cli/pdf_dev/40c595b5-3b62-4021-b8dd-5e445d223c47.html.json


BIN
tests/test_cli/pdf_dev/40c595b5-3b62-4021-b8dd-5e445d223c47.html.pdf


Diff do ficheiro suprimidas por serem muito extensas
+ 0 - 0
tests/test_cli/pdf_dev/416b8524-9a6f-4b49-b7d4-56ce5c825699.html.json


BIN
tests/test_cli/pdf_dev/416b8524-9a6f-4b49-b7d4-56ce5c825699.html.pdf


Diff do ficheiro suprimidas por serem muito extensas
+ 0 - 0
tests/test_cli/pdf_dev/658cbc48-9edd-4537-8b02-261c052a2845.html.json


BIN
tests/test_cli/pdf_dev/658cbc48-9edd-4537-8b02-261c052a2845.html.pdf


Diff do ficheiro suprimidas por serem muito extensas
+ 0 - 0
tests/test_cli/pdf_dev/789b3b75-b5ad-49c2-8ba1-e8719f7a1d42.html.json


BIN
tests/test_cli/pdf_dev/789b3b75-b5ad-49c2-8ba1-e8719f7a1d42.html.pdf


Diff do ficheiro suprimidas por serem muito extensas
+ 0 - 0
tests/test_cli/pdf_dev/9eb3c6a7-1564-4a10-8cfb-56c628e46208.html.json


BIN
tests/test_cli/pdf_dev/9eb3c6a7-1564-4a10-8cfb-56c628e46208.html.pdf


Diff do ficheiro suprimidas por serem muito extensas
+ 0 - 0
tests/test_cli/pdf_dev/b80cbc13-6655-42a8-a3a1-fe2db6eff883.html.json


BIN
tests/test_cli/pdf_dev/b80cbc13-6655-42a8-a3a1-fe2db6eff883.html.pdf


Diff do ficheiro suprimidas por serem muito extensas
+ 0 - 0
tests/test_cli/pdf_dev/bb72581d-bcbd-419c-ba55-a26af7c7f00d.html.json


BIN
tests/test_cli/pdf_dev/bb72581d-bcbd-419c-ba55-a26af7c7f00d.html.pdf


Diff do ficheiro suprimidas por serem muito extensas
+ 0 - 0
tests/test_cli/pdf_dev/ef36fc6f-d521-49b6-9846-85e565404632.html.json


BIN
tests/test_cli/pdf_dev/ef36fc6f-d521-49b6-9846-85e565404632.html.pdf


Diff do ficheiro suprimidas por serem muito extensas
+ 0 - 0
tests/test_cli/pdf_dev/p3_图文混排84.json


BIN
tests/test_cli/pdf_dev/p3_图文混排84.pdf


+ 9 - 15
tests/test_cli/test_cli.py

@@ -6,35 +6,27 @@ from lib import common
 import logging
 import os
 import json
-
 from loguru import logger
-
 from magic_pdf.pipe.UNIPipe import UNIPipe
 from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
 pdf_res_path = conf.conf["pdf_res_path"]
 code_path = conf.conf["code_path"]
 pdf_dev_path = conf.conf["pdf_dev_path"]
 class TestCli:
-   
-    def test_pdf_specify_dir(self):
-        """
-        输入pdf和指定目录的模型结果
-        """
-        cmd = 'cd %s && export PYTHONPATH=. && find %s -type f -name "*.pdf" | xargs -I{} python magic_pdf/cli/magicpdf.py  pdf-command  --pdf {}' % (code_path, pdf_dev_path)
-        logging.info(cmd)
-        common.check_shell(cmd)
-        #common.count_folders_and_check_contents(pdf_res_path)      
-   
+    """
+    test cli
+    """
     def test_pdf_sdk(self):
         """
         pdf sdk 方式解析
         """
         demo_names = list()
-        for pdf_file in os.listdir(pdf_dev_path):
+        pdf_path = os.path.join(pdf_dev_path, "pdf")
+        for pdf_file in os.listdir(pdf_path):
             if pdf_file.endswith('.pdf'):
                 demo_names.append(pdf_file.split('.')[0])
         for demo_name in demo_names:
-            model_path = os.path.join(pdf_dev_path, f"{demo_name}.json")
+            model_path = os.path.join(pdf_dev_path, f"{demo_name}_model.json")
             pdf_path = os.path.join(pdf_dev_path, f"{demo_name}.pdf")
             pdf_bytes = open(pdf_path, "rb").read()
             model_json = json.loads(open(model_path, "r", encoding="utf-8").read())
@@ -45,9 +37,11 @@ class TestCli:
             pipe.pipe_classify()
             pipe.pipe_parse()
             md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
-            with open(f"{demo_name}.md", "w", encoding="utf-8") as f:
+            res_path = os.path.join(pdf_dev_path, "miner", f"{demo_name}.md")
+            with open(res_path, "w", encoding="utf-8") as f:
                 f.write(md_content)
 
+        
     # def test_pdf_specify_jsonl(self):
     #     """
     #     输入jsonl, 默认方式解析

Alguns ficheiros não foram mostrados porque muitos ficheiros mudaram neste diff