from bs4 import BeautifulSoup
from collections import Counter
def find_repeating_pattern(html_content):
    """Return the signature of the most frequently repeated tag structure.

    Every tag in *html_content* is reduced to a nested tuple
    ``(signature, child_structures)``, where the signature is the tag name
    followed by its attributes sorted by name. Identical tuples are counted
    and the most common one wins (ties go to the structure seen first).

    Args:
        html_content: HTML markup to analyse.

    Returns:
        The signature string (``"name[attr='value' ...]"``) of the root tag
        of the most common structure, or ``None`` when the markup contains
        no tags or no structure occurs more than once.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    def get_tag_signature(tag):
        # Sorting makes the signature independent of attribute order.
        attrs = ' '.join(f"{k}='{v}'" for k, v in sorted(tag.attrs.items()))
        return f"{tag.name}[{attrs}]"

    def analyze_structure(tag):
        # find_all(True, recursive=False) yields only the direct child tags
        # (text nodes are skipped). The previous isinstance check against
        # ``BeautifulSoup.Tag`` raised AttributeError because the
        # BeautifulSoup class has no ``Tag`` attribute.
        children = tuple(
            analyze_structure(child)
            for child in tag.find_all(True, recursive=False)
        )
        return (get_tag_signature(tag), children)

    pattern_counter = Counter(analyze_structure(tag) for tag in soup.find_all(True))
    if not pattern_counter:
        # No tags at all: most_common(1)[0] would raise IndexError.
        return None

    most_common_pattern, count = pattern_counter.most_common(1)[0]
    return most_common_pattern[0] if count > 1 else None
# Usage example: the sample markup below contains the same
# <p class="img"> fragment twice. Note that this runs at module level.
html_content = """
<p class="img">
<p>
<div>
</div>
</p>
</p>
<p class="img">
<p>
<div>
</div>
</p>
</p>
"""
repeating_pattern = find_repeating_pattern(html_content)
# The printed label is Korean for "top-level tag of the most repeated pattern".
print(f"가장 많이 반복되는 패턴의 상위 태그: {repeating_pattern}")
import requests
from bs4 import BeautifulSoup
from collections import Counter
def find_repeating_patterns(url, threshold=3, max_depth=5, timeout=10):
    """Fetch a page and return the element paths that repeat on it.

    The document is walked from ``<body>`` downwards. Each visited element
    contributes a path made of "/"-joined signatures (tag name followed by
    its attributes sorted by name), and paths seen at least *threshold*
    times are reported.

    Args:
        url: Address of the page to download.
        threshold: Minimum number of occurrences for a path to be reported.
        max_depth: Children are only visited while the current depth is
            below this value (the start element is depth 0).
        timeout: Seconds to wait for the HTTP request; without a timeout
            ``requests.get`` can block indefinitely.

    Returns:
        A list of path strings in order of first appearance.

    Raises:
        requests.RequestException: If the page cannot be downloaded.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    def get_element_signature(element):
        attrs = ''.join(f" {k}='{v}'" for k, v in sorted(element.attrs.items()))
        return f"{element.name}{attrs}"

    pattern_counts = Counter()

    def find_patterns(element, depth=0, path=''):
        current_path = f"{path}/{get_element_signature(element)}"
        pattern_counts[current_path] += 1
        if depth < max_depth:
            for child in element.children:
                # Children without a tag name (e.g. text nodes) are skipped.
                if child.name:
                    find_patterns(child, depth + 1, current_path)

    # html.parser does not synthesise a <body>; for a fragment or a
    # malformed page soup.body is None, which used to crash on ``.attrs``.
    # Fall back to the document root in that case.
    body = soup.body
    find_patterns(body if body is not None else soup)

    return [
        pattern
        for pattern, count in pattern_counts.items()
        if count >= threshold
    ]
def extract_elements(soup, pattern):
    """Return the elements of *soup* whose signature path equals *pattern*.

    A path is built for every element the same way the pattern strings are
    built: "/"-joined signatures starting at ``<body>``, each signature
    being the tag name followed by `` key='value'`` pairs sorted by key.

    The previous implementation could not work: it passed the signature
    path to ``soup.select()`` although it is not a CSS selector, and it
    called ``get_element_signature``, which is not defined at module level.

    Args:
        soup: Parsed document (anything exposing ``body``; elements must
            expose ``name``, ``attrs`` and ``children``).
        pattern: Path string such as ``"/body/ul/li class='['item']'"``.

    Returns:
        Matching elements in document order; an empty list if none match.
    """
    def signature(element):
        attrs = ''.join(f" {k}='{v}'" for k, v in sorted(element.attrs.items()))
        return f"{element.name}{attrs}"

    body = soup.body
    root = body if body is not None else soup

    matches = []
    # Explicit stack of (element, path of its parent); children are pushed
    # in reverse so elements are visited in document order.
    stack = [(root, '')]
    while stack:
        element, parent_path = stack.pop()
        path = f"{parent_path}/{signature(element)}"
        if path == pattern:
            matches.append(element)
        elif pattern.startswith(path + '/'):
            # Only descend while this path can still be a prefix of pattern.
            children = [
                child for child in element.children
                if getattr(child, 'name', None)  # skip text nodes
            ]
            stack.extend((child, path) for child in reversed(children))
    return matches
# Usage example. Note that this runs at module level and performs network I/O.
url = "https://example.com/page-with-repeating-items"
patterns = find_repeating_patterns(url)

# Download and parse the page once for all patterns; previously it was
# re-fetched and re-parsed inside the loop for every single pattern.
# Nothing is fetched when no pattern was found.
page_soup = (
    BeautifulSoup(requests.get(url, timeout=10).text, 'html.parser')
    if patterns
    else None
)

print("발견된 반복 패턴:")
for pattern in patterns:
    print(f"패턴: {pattern}")
    elements = extract_elements(page_soup, pattern)
    print(f"반복 횟수: {len(elements)}")
    print("첫 번째 요소 예시:")
    print(elements[0].prettify() if elements else "No elements found")
    print("\n" + "="*50 + "\n")
# TODO: needs testing (테스트 필요).
# /search/chatgpt-functionsreul-iyonghae-A4.JfuzkQvOQd1JCAvsu1Q
# 댓글 (comments)