웹페이지 pattern catch (python)

from bs4 import BeautifulSoup
from collections import Counter

def find_repeating_pattern(html_content):
    """Return the signature of the most frequently repeated tag structure.

    Every tag in the parsed document is reduced to a nested, hashable
    ``(signature, children)`` tuple describing the tag and all of its
    descendant tags; identical tuples are then counted.

    Args:
        html_content: HTML markup to parse with ``html.parser``.

    Returns:
        The signature string (``"name[attr='value' ...]"``) of the root tag
        of the most common structure, or ``None`` when no structure occurs
        more than once or the markup contains no tags at all.
    """
    # Function-scope import: the file only imports the BeautifulSoup class,
    # and that class has no ``Tag`` attribute.
    from bs4 import Tag

    soup = BeautifulSoup(html_content, 'html.parser')

    def get_tag_signature(tag):
        # Sort the attributes so that their order in the markup is irrelevant.
        attrs = ' '.join(f"{k}='{v}'" for k, v in sorted(tag.attrs.items()))
        return f"{tag.name}[{attrs}]"

    def analyze_structure(tag):
        # Only child tags take part in the structure; text nodes, comments
        # and other non-tag children are skipped.
        children = tuple(
            analyze_structure(child)
            for child in tag.children
            if isinstance(child, Tag)
        )
        return (get_tag_signature(tag), children)

    pattern_counter = Counter(analyze_structure(tag) for tag in soup.find_all())

    # A document without any tags has nothing to count.
    if not pattern_counter:
        return None

    most_common_pattern, count = pattern_counter.most_common(1)[0]
    return most_common_pattern[0] if count > 1 else None

# Usage example: the sample markup contains the same <p class="img"> block
# twice, each wrapping a <p> that wraps an empty <div>.
html_content = """
<p class="img"> 
   <p> 
      <div>
      </div>
   </p>
</p>
<p class="img"> 
   <p> 
      <div>
      </div>
   </p>
</p>
"""

# Run the detector on the sample markup and print whatever it returns.
repeating_pattern = find_repeating_pattern(html_content)
print(f"가장 많이 반복되는 패턴의 상위 태그: {repeating_pattern}")

import requests
from bs4 import BeautifulSoup
from collections import Counter

def find_repeating_patterns(url, threshold=3, max_depth=5, timeout=10):
    """Fetch a page and return the element paths that repeat inside <body>.

    Each element is described by a path made of the signatures of its
    ancestors (starting at ``<body>``) and of itself, joined with ``/``.
    A signature is the tag name followed by its sorted attributes.

    Args:
        url: Address of the page to download.
        threshold: Minimum number of occurrences for a path to be reported.
        max_depth: Deepest level below ``<body>`` (which is depth 0) whose
            children are still visited.
        timeout: Seconds passed to ``requests.get`` so that an unresponsive
            server cannot block the call forever.

    Returns:
        A list of path strings occurring at least ``threshold`` times, in
        order of first appearance. Empty when the page has no ``<body>``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Do not analyse the markup of an error page as if it were the content.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    body = soup.body
    if body is None:
        # Nothing to walk when the parser found no <body> element.
        return []

    def get_element_signature(element):
        attrs = ''.join(f" {k}='{v}'" for k, v in sorted(element.attrs.items()))
        return f"{element.name}{attrs}"

    pattern_counts = Counter()

    def find_patterns(element, depth=0, path=''):
        current_path = f"{path}/{get_element_signature(element)}"
        pattern_counts[current_path] += 1

        if depth < max_depth:
            for child in element.children:
                # Text nodes carry no tag name and are not part of a path.
                if getattr(child, 'name', None):
                    find_patterns(child, depth + 1, current_path)

    find_patterns(body)

    return [pattern for pattern, count in pattern_counts.items() if count >= threshold]

def extract_elements(soup, pattern):
    """Return the elements under ``soup.body`` whose signature path equals ``pattern``.

    The path of an element is built the same way the pattern strings are:
    ``/`` followed by the signature of ``<body>``, then ``/`` and the
    signature of each descendant down to the element itself. A signature is
    the tag name followed by `` key='value'`` for every attribute, sorted by
    attribute name.

    The tree is walked instead of handing the pattern to a CSS selector
    engine, because a signature path is not valid selector syntax and
    attribute values may themselves contain ``/``.

    Args:
        soup: Parsed document; only its ``body`` attribute and the ``name``,
            ``attrs`` and ``children`` of the elements below it are read.
        pattern: Signature path to look for, e.g. ``"/body/ul/li"``.

    Returns:
        The matching elements in document order; an empty list when nothing
        matches or the document has no body.
    """
    def signature(element):
        attrs = ''.join(f" {k}='{v}'" for k, v in sorted(element.attrs.items()))
        return f"{element.name}{attrs}"

    body = soup.body
    if body is None:
        return []

    elements = []
    # Depth-first walk that carries the path of the parent along with each
    # element, so no path ever has to be split back into segments.
    stack = [(body, '')]
    while stack:
        element, parent_path = stack.pop()
        current_path = f"{parent_path}/{signature(element)}"

        if current_path == pattern:
            elements.append(element)
        elif pattern.startswith(current_path + '/'):
            # Only descend while this path is still a prefix of the target.
            child_tags = [
                child for child in element.children
                if getattr(child, 'name', None)
            ]
            # Push in reverse so that children are popped in document order.
            stack.extend((child, current_path) for child in reversed(child_tags))

    return elements

# Usage example
url = "https://example.com/page-with-repeating-items"
patterns = find_repeating_patterns(url)

# Download and parse the page once for all patterns instead of once per
# pattern, and only when there is at least one pattern to look up.
page_soup = (
    BeautifulSoup(requests.get(url, timeout=10).text, 'html.parser')
    if patterns
    else None
)

print("발견된 반복 패턴:")
for pattern in patterns:
    print(f"패턴: {pattern}")
    elements = extract_elements(page_soup, pattern)
    print(f"반복 횟수: {len(elements)}")
    print("첫 번째 요소 예시:")
    print(elements[0].prettify() if elements else "No elements found")
    print("\n" + "="*50 + "\n")

※ 위 코드는 아직 테스트가 필요합니다.

/search/chatgpt-functionsreul-iyonghae-A4.JfuzkQvOQd1JCAvsu1Q

티스토리툴바