from collections import Counter

from bs4 import BeautifulSoup, Tag


def find_repeating_pattern(html_content):
    """Return the signature of the most frequently repeated tag structure.

    Every tag in the parsed document is reduced to a nested
    ``(signature, children)`` tuple, where ``signature`` is
    ``"name[attr='value' ...]"`` (attributes sorted by name) and ``children``
    holds the same tuples for the tag's direct child tags. Text nodes are
    ignored. Two tags are considered the same pattern when their whole
    subtrees produce equal tuples.

    Args:
        html_content: HTML markup to parse with the ``html.parser`` backend.

    Returns:
        The signature string of the root tag of the most common structure,
        or ``None`` when no structure occurs more than once (including when
        the document contains no tags at all). On ties the structure that
        appears first in document order wins.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Structure tuples keyed by id(tag): find_all() visits every descendant,
    # so without the memo each subtree would be rebuilt once per ancestor.
    memo = {}

    def get_tag_signature(tag):
        # NOTE: attribute values are formatted with str(); a value that bs4
        # exposes as a list therefore appears in its list repr.
        attrs = ' '.join(f"{k}='{v}'" for k, v in sorted(tag.attrs.items()))
        return f"{tag.name}[{attrs}]"

    def analyze_structure(tag):
        key = id(tag)
        if key not in memo:
            # Tag must be imported from bs4; it is not an attribute of the
            # BeautifulSoup class, so BeautifulSoup.Tag raises AttributeError.
            children = tuple(
                analyze_structure(child)
                for child in tag.children
                if isinstance(child, Tag)
            )
            memo[key] = (get_tag_signature(tag), children)
        return memo[key]

    pattern_counter = Counter(analyze_structure(tag) for tag in soup.find_all())
    if not pattern_counter:
        # No tags at all: most_common(1) would be empty and [0] would fail.
        return None

    most_common_pattern, count = pattern_counter.most_common(1)[0]
    return most_common_pattern[0] if count > 1 else None


# Usage example
html_content = """ <p class="img"> <p> <div> </div> </p> </p> <p class="img"> <p> <div> </div> </p> </p> """
repeating_pattern = find_repeating_pattern(html_content)
print(f"가장 많이 반복되는 패턴의 상위 태그: {repeating_pattern}")
from collections import Counter

import requests
from bs4 import BeautifulSoup


def _element_signature(element):
    """Return ``"name attr='value' ..."`` for *element*, attributes sorted by name."""
    attrs = ''.join(f" {k}='{v}'" for k, v in sorted(element.attrs.items()))
    return f"{element.name}{attrs}"


def _walk_paths(element, max_depth, depth=0, path=''):
    """Yield the slash-joined signature path of *element* and its descendants.

    Paths are yielded in document order. Children are only visited while
    ``depth < max_depth``, with the starting element at depth 0.
    """
    current_path = f"{path}/{_element_signature(element)}"
    yield current_path
    if depth < max_depth:
        for child in element.children:
            # Text nodes have no tag name; only real tags are followed.
            if child.name:
                yield from _walk_paths(child, max_depth, depth + 1, current_path)


def find_repeating_patterns(url, threshold=3, max_depth=5, timeout=10):
    """Download a page and list the element paths that repeat within it.

    Each element under ``<body>`` is described by the path of signatures from
    ``<body>`` down to itself (``"/body/div class='...'/p"``). Paths that
    occur at least ``threshold`` times are reported.

    Args:
        url: Address of the page to fetch.
        threshold: Minimum number of occurrences for a path to be reported.
        max_depth: Maximum depth below ``<body>`` to search (``<body>`` is 0).
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Returns:
        A list of path strings in order of first occurrence; empty when the
        page has no ``<body>``.

    Raises:
        requests.RequestException: If the download fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    body = soup.body
    if body is None:
        # Nothing to analyse; avoids an AttributeError on None.attrs.
        return []

    pattern_counts = Counter(_walk_paths(body, max_depth))
    return [pattern for pattern, count in pattern_counts.items() if count >= threshold]


def extract_elements(soup, pattern):
    """Return the elements of *soup* whose signature path equals *pattern*.

    *pattern* is a path string in the format produced by
    ``find_repeating_patterns``. The tree is walked from ``soup.body`` and
    only branches whose path is a prefix of *pattern* are followed. A path is
    not a CSS selector, so ``soup.select`` cannot be used for this lookup.

    Args:
        soup: Parsed document to search.
        pattern: Slash-joined signature path starting at ``<body>``.

    Returns:
        A list of matching elements in document order; empty when there is no
        ``<body>`` or nothing matches.
    """
    body = soup.body
    if body is None:
        return []

    elements = []

    def visit(element, path):
        current_path = f"{path}/{_element_signature(element)}"
        if current_path == pattern:
            elements.append(element)
            return  # Descendant paths are strictly longer, so none can match.
        if pattern.startswith(current_path + '/'):
            for child in element.children:
                if child.name:
                    visit(child, current_path)

    visit(body, '')
    return elements


# Usage example
url = "https://example.com/page-with-repeating-items"
patterns = find_repeating_patterns(url)

# Parse the page once and reuse it for every pattern instead of downloading
# it again on each loop iteration.
page_soup = BeautifulSoup(requests.get(url, timeout=10).text, 'html.parser')

print("발견된 반복 패턴:")
for pattern in patterns:
    print(f"패턴: {pattern}")
    elements = extract_elements(page_soup, pattern)
    print(f"반복 횟수: {len(elements)}")
    print("첫 번째 요소 예시:")
    print(elements[0].prettify() if elements else "No elements found")
    print("\n" + "="*50 + "\n")
테스트 필요
/search/chatgpt-functionsreul-iyonghae-A4.JfuzkQvOQd1JCAvsu1Q
댓글