## 1. Introduction: Why Automate Sitemap Generation?

In modern SEO, the sitemap plays a crucial role: it helps search engine crawlers index a site's content more efficiently, and it improves how thoroughly the site gets indexed and how it ranks. For large, dynamic websites, however, maintaining a sitemap by hand is practically impossible. This article shows how to build an efficient, intelligent sitemap generator with Python crawling techniques, combining modern asynchronous programming with lightweight content-analysis (AI/NLP) heuristics.

## 2. Technology Stack Overview

The solution uses the following modern stack:

- Python 3.9+ — modern asyncio support
- aiohttp / asyncio — asynchronous HTTP client for higher crawl throughput
- BeautifulSoup4 — HTML parsing
- lxml — high-performance HTML/XML processing
- aiodns — asynchronous DNS resolution
- uvloop — drop-in replacement for the asyncio event loop with better performance
- Scrapy — optional, higher-level crawling framework
- Machine-learning assistance — simple NLP-style heuristics to identify important pages

## 3. Core Crawler Design and Implementation

### 3.1 Basic Asynchronous Crawler Architecture

```python
import asyncio
import aiohttp
import uvloop
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from typing import Set, Dict, List, Optional
import time
import xml.etree.ElementTree as ET
from xml.dom import minidom
from dataclasses import dataclass
from datetime import datetime
import hashlib
import zlib
import json
from concurrent.futures import ThreadPoolExecutor
import logging

# Use uvloop as the asyncio event loop
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


@dataclass
class PageInfo:
    """Data class describing a crawled page."""
    url: str
    title: str
    last_modified: str
    change_frequency: str
    priority: float
    content_hash: str
    depth: int
    parent_url: Optional[str] = None
    meta_description: Optional[str] = None
    word_count: int = 0
    is_canonical: bool = True


class AdvancedSitemapGenerator:
    """Advanced sitemap generator."""

    def __init__(
        self,
        start_url: str,
        max_depth: int = 5,
        max_concurrency: int = 100,
        respect_robots: bool = True,
        user_agent: str = "AdvancedSitemapBot/1.0 (https://example.com/bot)"
    ):
        self.start_url = start_url
        self.max_depth = max_depth
        self.max_concurrency = max_concurrency
        # Note: accepted but not enforced in this listing
        self.respect_robots = respect_robots
        self.user_agent = user_agent
        # URLs that have already been visited
        self.visited_urls: Set[str] = set()
        # Per-page metadata
        self.page_infos: Dict[str, PageInfo] = {}
        # Restrict crawling to the start URL's domain
        self.base_domain = urlparse(start_url).netloc
        # Connection pool / session
        self.connector = None
        self.session = None
        # Performance statistics
        self.stats = {
            "pages_crawled": 0,
            "total_size": 0,
            "start_time": None,
            "end_time": None
        }
        # Default request headers
        self.headers = {
            "User-Agent": user_agent,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        }

    async def __aenter__(self):
        """Async context manager entry."""
        timeout = aiohttp.ClientTimeout(total=30)
        self.connector = aiohttp.TCPConnector(
            limit=self.max_concurrency,
            limit_per_host=10,
            ttl_dns_cache=300,
            enable_cleanup_closed=True
        )
        self.session = aiohttp.ClientSession(
            connector=self.connector,
            timeout=timeout,
            headers=self.headers
        )
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit."""
        if self.session:
            await self.session.close()
        if self.connector:
            await self.connector.close()

    def calculate_content_hash(self, content: str) -> str:
        """Hash the page content (used to detect content changes)."""
        content_clean = "".join(content.split())
        return hashlib.sha256(content_clean.encode()).hexdigest()[:16]

    def calculate_priority(self, url: str, depth: int, word_count: int) -> float:
        """Compute page priority from depth, path and content length."""
        base_priority = 1.0 - (depth * 0.15)
        # The home page gets the highest priority
        if depth == 0:
            return 1.0
        # Boost pages whose path contains important keywords
        important_keywords = ["product", "service", "about", "contact", "blog"]
        path = urlparse(url).path.lower()
        if any(keyword in path for keyword in important_keywords):
            base_priority = min(base_priority + 0.2, 0.9)
        # Boost content-rich pages
        if word_count > 1000:
            base_priority = min(base_priority + 0.1, 0.9)
        return round(max(base_priority, 0.1), 2)

    def determine_change_freq(self, url: str) -> str:
        """Infer the page's change frequency from URL patterns."""
        path = urlparse(url).path
        if "/blog/" in path or "/news/" in path:
            return "weekly"
        elif any(x in path for x in ["/product/", "/service/"]):
            return "monthly"
        elif path in ["/", "/home", "/index"]:
            return "daily"
        else:
            return "monthly"

    async def fetch_page(self, url: str) -> tuple:
        """Fetch a page asynchronously; returns (content, headers) or (None, None)."""
        try:
            # ssl=False skips certificate verification; tighten this in production
            async with self.session.get(url, ssl=False) as response:
                if response.status == 200:
                    content = await response.text()
                    content_type = response.headers.get("Content-Type", "")
                    # Only process HTML content
                    if "text/html" in content_type:
                        self.stats["total_size"] += len(content)
                        return content, response.headers
                    else:
                        logger.debug(f"Non-HTML content: {url}")
                        return None, None
                else:
                    logger.warning(f"HTTP {response.status}: {url}")
                    return None, None
        except Exception as e:
            logger.error(f"Failed to fetch {url}: {str(e)}")
            return None, None

    def extract_links(self, html_content: str, base_url: str) -> List[str]:
        """Extract all links from an HTML document."""
        soup = BeautifulSoup(html_content, "lxml")
        links = []
        for link in soup.find_all("a", href=True):
            href = link.get("href").strip()
            # Skip in-page anchors and javascript: links
            if href.startswith("#") or href.startswith("javascript:"):
                continue
            # Resolve to an absolute URL
            absolute_url = urljoin(base_url, href)
            parsed_url = urlparse(absolute_url)
            # Keep only HTTP/HTTPS URLs
            if parsed_url.scheme not in ("http", "https"):
                continue
            # Stay on the same domain (could be relaxed to allow subdomains)
            if parsed_url.netloc == self.base_domain:
                # Normalize the URL (drop fragments, sort query parameters, ...)
                normalized_url = self.normalize_url(absolute_url)
                links.append(normalized_url)
        return list(set(links))  # de-duplicate

    def normalize_url(self, url: str) -> str:
        """Normalize a URL."""
        parsed = urlparse(url)
        # Remove the fragment identifier
        parsed = parsed._replace(fragment="")
        # Sort the query parameters (optional)
        if parsed.query:
            query_params = parsed.query.split("&")
            query_params.sort()
            parsed = parsed._replace(query="&".join(query_params))
        # Make sure URLs without a path end in "/"
        if not parsed.path:
            parsed = parsed._replace(path="/")
        return parsed.geturl()

    def extract_page_info(self, html_content: str, url: str, depth: int) -> PageInfo:
        """Extract page metadata."""
        soup = BeautifulSoup(html_content, "lxml")
        # Title
        title_tag = soup.find("title")
        title = title_tag.get_text().strip() if title_tag else "Untitled"
        # Meta description
        meta_desc = soup.find("meta", attrs={"name": "description"})
        description = meta_desc.get("content", "").strip() if meta_desc else ""
        # Content length
        text_content = soup.get_text()
        word_count = len(text_content.split())
        # Canonical tag
        canonical = soup.find("link", rel="canonical")
        is_canonical = True
        if canonical and canonical.get("href"):
            canonical_url = urljoin(url, canonical["href"])
            if self.normalize_url(canonical_url) != self.normalize_url(url):
                is_canonical = False
        # Content hash
        content_hash = self.calculate_content_hash(text_content)
        # Priority and change frequency
        priority = self.calculate_priority(url, depth, word_count)
        change_freq = self.determine_change_freq(url)
        # Use the current time as last-modified; in practice, read it from the HTTP headers
        last_mod = datetime.now().isoformat()
        return PageInfo(
            url=url,
            title=title,
            last_modified=last_mod,
            change_frequency=change_freq,
            priority=priority,
            content_hash=content_hash,
            depth=depth,
            meta_description=description,
            word_count=word_count,
            is_canonical=is_canonical
        )

    async def crawl_page(self, url: str, depth: int, parent_url: str = None):
        """Crawl a single page and recursively crawl its links."""
        if depth > self.max_depth:
            return
        if url in self.visited_urls:
            return
        self.visited_urls.add(url)
        logger.info(f"Crawling: {url} (depth: {depth})")
        # Fetch the page
        html_content, headers = await self.fetch_page(url)
        if not html_content:
            return
        # Extract page metadata
        page_info = self.extract_page_info(html_content, url, depth)
        page_info.parent_url = parent_url
        self.page_infos[url] = page_info
        self.stats["pages_crawled"] += 1
        # Extract and follow links
        if depth < self.max_depth:
            links = self.extract_links(html_content, url)
            # Crawl child links asynchronously
            tasks = []
            for link in links:
                if link not in self.visited_urls:
                    task = asyncio.create_task(
                        self.crawl_page(link, depth + 1, url)
                    )
                    tasks.append(task)
            # Limit concurrency by processing tasks in batches
            for i in range(0, len(tasks), 10):
                batch = tasks[i:i + 10]
                if batch:
                    await asyncio.gather(*batch, return_exceptions=True)

    def generate_sitemap_xml(self) -> str:
        """Generate a standard XML sitemap."""
        urlset = ET.Element("urlset")
        urlset.set("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")
        urlset.set("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance")
        urlset.set("xsi:schemaLocation",
                   "http://www.sitemaps.org/schemas/sitemap/0.9 "
                   "http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd")
        # Sort by priority
        sorted_pages = sorted(
            self.page_infos.values(),
            key=lambda x: x.priority,
            reverse=True
        )
        for page in sorted_pages:
            # Skip non-canonical pages
            if not page.is_canonical:
                continue
            url_elem = ET.SubElement(urlset, "url")
            ET.SubElement(url_elem, "loc").text = page.url
            ET.SubElement(url_elem, "lastmod").text = page.last_modified
            ET.SubElement(url_elem, "changefreq").text = page.change_frequency
            ET.SubElement(url_elem, "priority").text = str(page.priority)
            # Optional extra information (note: <title> is not part of the
            # sitemap protocol, so search engines will ignore it)
            if page.title:
                title_elem = ET.SubElement(url_elem, "title")
                title_elem.text = page.title
        # Pretty-print the XML
        rough_string = ET.tostring(urlset, "utf-8")
        reparsed = minidom.parseString(rough_string)
        return reparsed.toprettyxml(indent="  ")

    def generate_sitemap_index(self, sitemap_urls: List[str]) -> str:
        """Generate a sitemap index file (for large sites)."""
        sitemapindex = ET.Element("sitemapindex")
        sitemapindex.set("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")
        for sitemap_url in sitemap_urls:
            sitemap_elem = ET.SubElement(sitemapindex, "sitemap")
            ET.SubElement(sitemap_elem, "loc").text = sitemap_url
            ET.SubElement(sitemap_elem, "lastmod").text = datetime.now().isoformat()
        rough_string = ET.tostring(sitemapindex, "utf-8")
        reparsed = minidom.parseString(rough_string)
        return reparsed.toprettyxml(indent="  ")

    def generate_sitemap_txt(self) -> str:
        """Generate a plain-text sitemap."""
        lines = []
        for page in sorted(self.page_infos.values(), key=lambda x: x.priority, reverse=True):
            if page.is_canonical:
                lines.append(page.url)
        return "\n".join(lines)

    def generate_sitemap_json(self) -> str:
        """Generate a JSON sitemap (convenient for APIs)."""
        sitemap_data = {
            "generated_at": datetime.now().isoformat(),
            "base_url": self.start_url,
            "total_pages": len(self.page_infos),
            "pages": []
        }
        for url, page_info in self.page_infos.items():
            if page_info.is_canonical:
                sitemap_data["pages"].append({
                    "url": page_info.url,
                    "title": page_info.title,
                    "last_modified": page_info.last_modified,
                    "change_frequency": page_info.change_frequency,
                    "priority": page_info.priority,
                    "depth": page_info.depth,
                    "word_count": page_info.word_count,
                    "meta_description": page_info.meta_description
                })
        return json.dumps(sitemap_data, indent=2, ensure_ascii=False)

    def save_sitemap(self, filename: str, format: str = "xml"):
        """Write the sitemap to a file."""
        if format == "xml":
            content = self.generate_sitemap_xml()
        elif format == "txt":
            content = self.generate_sitemap_txt()
        elif format == "json":
            content = self.generate_sitemap_json()
        else:
            raise ValueError(f"Unsupported format: {format}")
        with open(filename, "w", encoding="utf-8") as f:
            f.write(content)
        logger.info(f"Sitemap saved to: {filename} ({len(content)} bytes)")

    def print_statistics(self):
        """Print crawl statistics."""
        elapsed = self.stats["end_time"] - self.stats["start_time"]
        print("\n" + "=" * 60)
        print("Crawl statistics")
        print("=" * 60)
        print(f"Start URL: {self.start_url}")
        print(f"Pages crawled: {self.stats['pages_crawled']}")
        print(f"URLs discovered: {len(self.visited_urls)}")
        print(f"Valid pages: {len(self.page_infos)}")
        print(f"Total data: {self.stats['total_size'] / 1024:.2f} KB")
        print(f"Elapsed: {elapsed:.2f} s")
        print(f"Average speed: {self.stats['pages_crawled'] / elapsed:.2f} pages/s")
        print("\nPage priority distribution:")
        priorities = [p.priority for p in self.page_infos.values()]
        if priorities:
            print(f"  Max: {max(priorities):.2f}")
            print(f"  Min: {min(priorities):.2f}")
            print(f"  Avg: {sum(priorities) / len(priorities):.2f}")
        print("=" * 60)

    async def run(self):
        """Run the crawler."""
        self.stats["start_time"] = time.time()
        async with self:
            await self.crawl_page(self.start_url, depth=0)
        self.stats["end_time"] = time.time()
        # Print statistics
        self.print_statistics()
        # Save the sitemaps
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.save_sitemap(f"sitemap_{timestamp}.xml", "xml")
        self.save_sitemap(f"sitemap_{timestamp}.txt", "txt")
        self.save_sitemap(f"sitemap_{timestamp}.json", "json")
        return self.page_infos


async def main():
    """Example usage."""
    generator = AdvancedSitemapGenerator(
        start_url="https://example.com",  # replace with the target site
        max_depth=3,
        max_concurrency=50,
        respect_robots=True,
        user_agent="AdvancedSitemapBot/1.0 (https://mycrawler.com/bot)"
    )
    try:
        await generator.run()
    except KeyboardInterrupt:
        logger.info("Crawler interrupted by user")
    except Exception as e:
        logger.error(f"Crawler failed: {str(e)}")


if __name__ == "__main__":
    # Run the async entry point
    asyncio.run(main())
```

### 3.2 Intelligent Crawler Enhancements

```python
class IntelligentSitemapGenerator(AdvancedSitemapGenerator):
    """Sitemap generator with simple content-analysis heuristics."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.important_keywords = self.load_important_keywords()

    def load_important_keywords(self):
        """Load important keywords (could come from a file or database)."""
        return {
            "product", "service", "pricing", "buy", "order",
            "blog", "article", "news", "update",
            "about", "contact", "support", "help",
            "download", "documentation", "api", "tutorial"
        }

    def analyze_content_importance(self, text: str, url: str) -> float:
        """Score how important a page's content looks."""
        score = 0.0
        # Keyword occurrences
        text_lower = text.lower()
        for keyword in self.important_keywords:
            if keyword in text_lower:
                score += 0.1
        # Content length (moderate length scores highest)
        word_count = len(text.split())
        if 300 <= word_count <= 2000:
            score += 0.3
        elif word_count > 2000:
            score += 0.2
        # URL structure
        path = urlparse(url).path
        if path == "/" or path == "/index":
            score += 0.5
        return min(score, 1.0)

    def extract_page_info(self, html_content: str, url: str, depth: int) -> PageInfo:
        """Override the parent method to add content analysis."""
        page_info = super().extract_page_info(html_content, url, depth)
        # Analyze content importance
        soup = BeautifulSoup(html_content, "lxml")
        text_content = soup.get_text()
        importance_score = self.analyze_content_importance(text_content, url)
        # Adjust the priority
        page_info.priority = min(page_info.priority + importance_score * 0.2, 1.0)
        # Classify the page type
        page_type = self.classify_page_type(html_content, url)
        if page_type == "article":
            page_info.change_frequency = "weekly"
        elif page_type == "product":
            page_info.change_frequency = "monthly"
        return page_info

    def classify_page_type(self, html_content: str, url: str) -> str:
        """Classify the page type."""
        soup = BeautifulSoup(html_content, "lxml")
        # Check common page structures
        if soup.find("article"):
            return "article"
        elif soup.find("form"):
            return "form"
        elif "product" in url or "shop" in url:
            return "product"
        elif "blog" in url:
            return "blog"
        return "page"


class DistributedSitemapGenerator(IntelligentSitemapGenerator):
    """Distributed sitemap generator (for large sites)."""

    async def distributed_crawl(self, start_urls: List[str]):
        """Crawl several start URLs, bounded by a shared semaphore."""
        # Use a semaphore to cap overall concurrency
        semaphore = asyncio.Semaphore(self.max_concurrency)

        async def limited_crawl(url):
            async with semaphore:
                return await self.crawl_page(url, depth=0)

        tasks = [asyncio.create_task(limited_crawl(url)) for url in start_urls]
        await asyncio.gather(*tasks, return_exceptions=True)
```

## 4. Advanced Features and Optimizations

### 4.1 Proxies and Anti-Bot Evasion

```python
import random  # used for request jitter and User-Agent rotation


class AntiBlockSitemapGenerator(AdvancedSitemapGenerator):
    """Sitemap generator with basic anti-bot evasion."""

    def __init__(self, *args, proxy_list=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.proxy_list = proxy_list or []
        self.current_proxy_index = 0
        self.request_delay = 1.0  # base delay between requests, in seconds
        self.rotate_user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"
        ]

    async def fetch_page(self, url: str) -> tuple:
        """Override fetch_page to add anti-blocking measures."""
        # Random delay
        await asyncio.sleep(self.request_delay * (0.5 + random.random()))
        # Rotate the User-Agent
        self.headers["User-Agent"] = random.choice(self.rotate_user_agents)
        # Use a proxy if one is configured
        proxy = None
        if self.proxy_list:
            proxy = self.proxy_list[self.current_proxy_index]
            self.current_proxy_index = (self.current_proxy_index + 1) % len(self.proxy_list)
        try:
            async with self.session.get(
                url,
                proxy=proxy,
                ssl=False,
                headers=self.headers
            ) as response:
                # Handle common anti-bot responses
                if response.status == 429:  # Too Many Requests
                    logger.warning(f"Rate limited, waiting 5 seconds: {url}")
                    await asyncio.sleep(5)
                    return await self.fetch_page(url)  # retry
            # The request above only probes for rate limiting; the parent
            # implementation then performs the actual fetch (a second request,
            # without the proxy).
            return await super().fetch_page(url)
        except Exception as e:
            logger.error(f"Request failed: {url}, error: {str(e)}")
            return None, None
```

### 4.2 Incremental Updates and Caching

```python
class IncrementalSitemapGenerator(AdvancedSitemapGenerator):
    """Sitemap generator with incremental-update support."""

    def __init__(self, *args, cache_file="sitemap_cache.json", **kwargs):
        super().__init__(*args, **kwargs)
        self.cache_file = cache_file
        self.cache = self.load_cache()

    def load_cache(self) -> Dict:
        """Load the cache."""
        try:
            with open(self.cache_file, "r", encoding="utf-8") as f:
                return json.load(f)
        except FileNotFoundError:
            return {}

    def save_cache(self):
        """Save the cache."""
        cache_data = {
            "generated_at": datetime.now().isoformat(),
            "pages": {}
        }
        for url, page_info in self.page_infos.items():
            cache_data["pages"][url] = {
                "content_hash": page_info.content_hash,
                "last_modified": page_info.last_modified,
                "priority": page_info.priority
            }
        with open(self.cache_file, "w", encoding="utf-8") as f:
            json.dump(cache_data, f, indent=2)

    def detect_changes(self) -> Dict:
        """Detect page changes since the last run."""
        changes = {
            "new": [],
            "updated": [],
            "deleted": [],
            "unchanged": []
        }
        cached_urls = set(self.cache.get("pages", {}).keys())
        current_urls = set(self.page_infos.keys())
        # Newly discovered pages
        changes["new"] = list(current_urls - cached_urls)
        # Pages that disappeared
        changes["deleted"] = list(cached_urls - current_urls)
        # Pages whose content changed
        for url in current_urls.intersection(cached_urls):
            cached_hash = self.cache["pages"][url].get("content_hash", "")
            current_hash = self.page_infos[url].content_hash
            if cached_hash != current_hash:
                changes["updated"].append(url)
            else:
                changes["unchanged"].append(url)
        return changes

    def generate_update_sitemap(self, changes: Dict) -> str:
        """Generate a sitemap containing only new and updated pages."""
        urlset = ET.Element("urlset")
        urlset.set("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")
        # Only include new and updated pages
        for change_type, urls in changes.items():
            if change_type in ["new", "updated"]:
                for url in urls:
                    page_info = self.page_infos[url]
                    url_elem = ET.SubElement(urlset, "url")
                    ET.SubElement(url_elem, "loc").text = page_info.url
                    ET.SubElement(url_elem, "lastmod").text = page_info.last_modified
                    ET.SubElement(url_elem, "changefreq").text = page_info.change_frequency
                    ET.SubElement(url_elem, "priority").text = str(page_info.priority)
        rough_string = ET.tostring(urlset, "utf-8")
        reparsed = minidom.parseString(rough_string)
        return reparsed.toprettyxml(indent="  ")
```

## 5. Deployment and Usage Examples

### 5.1 Docker Deployment Configuration

```dockerfile
# Dockerfile
FROM python:3.9-slim

WORKDIR /app

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

# Install performance-optimization libraries
RUN pip install uvloop aiodns

# Create a non-root user
RUN useradd -m -u 1000 sitemapbot
USER sitemapbot

CMD ["python", "sitemap_generator.py"]
```

```yaml
# docker-compose.yml
version: "3.8"

services:
  sitemap-generator:
    build: .
    volumes:
      - ./data:/app/data
      - ./config:/app/config
    environment:
      - START_URL=https://example.com
      - MAX_DEPTH=3
      - MAX_CONCURRENCY=50
    restart: unless-stopped
    networks:
      - sitemap-net

  redis-cache:
    image: redis:alpine
    ports:
      - "6379:6379"
    volumes:
      - redis-data:/data
    networks:
      - sitemap-net

  scheduler:
    build: .
    command: python scheduler.py
    depends_on:
      - sitemap-generator
      - redis-cache
    environment:
      - REDIS_HOST=redis-cache
      - SCHEDULE_HOURS=24
    networks:
      - sitemap-net

volumes:
  redis-data:

networks:
  sitemap-net:
```

### 5.2 Scheduled Tasks and Automation

```python
# scheduler.py
import asyncio
import schedule
import time
import redis
import json
from datetime import datetime
from sitemap_generator import IntelligentSitemapGenerator


class SitemapScheduler:
    """Scheduler for periodic sitemap generation."""

    def __init__(self, redis_host="localhost", redis_port=6379):
        self.redis_client = redis.Redis(
            host=redis_host,
            port=redis_port,
            decode_responses=True
        )
        self.targets = self.load_targets()

    def load_targets(self):
        """Load the target-site configuration."""
        try:
            with open("config/targets.json", "r") as f:
                return json.load(f)
        except FileNotFoundError:
            return [
                {
                    "url": "https://example.com",
                    "max_depth": 3,
                    "schedule": "daily"
                }
            ]

    async def generate_for_target(self, target):
        """Generate a sitemap for a single target."""
        print(f"Generating sitemap for: {target['url']}")
        generator = IntelligentSitemapGenerator(
            start_url=target["url"],
            max_depth=target.get("max_depth", 3),
            max_concurrency=target.get("max_concurrency", 30)
        )
        try:
            pages = await generator.run()
            # Record the result in Redis
            timestamp = datetime.now().isoformat()
            key = f"sitemap:{target['url']}:{timestamp}"
            self.redis_client.set(key, json.dumps({
                "pages": len(pages),
                "generated_at": timestamp
            }))
            print(f"Done: {target['url']} - {len(pages)} pages")
        except Exception as e:
            print(f"Error: {target['url']} - {str(e)}")

    def run(self):
        """Run the scheduler."""
        # Register the scheduled jobs
        for target in self.targets:
            if target["schedule"] == "daily":
                schedule.every().day.at("02:00").do(
                    lambda t=target: asyncio.run(self.generate_for_target(t))
                )
            elif target["schedule"] == "weekly":
                schedule.every().monday.at("03:00").do(
                    lambda t=target: asyncio.run(self.generate_for_target(t))
                )
        print("Scheduler started...")
        # Main loop
        while True:
            schedule.run_pending()
            time.sleep(60)


if __name__ == "__main__":
    scheduler = SitemapScheduler()
    scheduler.run()
```

### 5.3 A Complete CLI Tool

```python
# cli.py
import argparse
import asyncio
import sys
from datetime import datetime  # needed for the timestamped output filenames
from pathlib import Path


def main():
    parser = argparse.ArgumentParser(
        description="Advanced sitemap generator - crawl a site and generate sitemaps"
    )
    parser.add_argument(
        "url",
        help="Starting URL of the target site"
    )
    parser.add_argument(
        "-d", "--depth",
        type=int,
        default=3,
        help="Crawl depth (default: 3)"
    )
    parser.add_argument(
        "-c", "--concurrency",
        type=int,
        default=50,
        help="Maximum concurrency (default: 50)"
    )
    parser.add_argument(
        "-o", "--output",
        default="./sitemaps",
        help="Output directory (default: ./sitemaps)"
    )
    parser.add_argument(
        "-f", "--format",
        choices=["xml", "txt", "json", "all"],
        default="all",
        help="Output format (default: all)"
    )
    parser.add_argument(
        "--incremental",
        action="store_true",
        help="Enable incremental-update mode"
    )
    parser.add_argument(
        "--intelligent",
        action="store_true",
        help="Enable intelligent analysis mode"
    )
    parser.add_argument(
        "--proxy-file",
        help="File containing a list of proxy servers"
    )
    args = parser.parse_args()

    # Create the output directory
    output_dir = Path(args.output)
    output_dir.mkdir(exist_ok=True)

    # Pick the generator class
    if args.intelligent:
        from sitemap_generator import IntelligentSitemapGenerator
        generator_class = IntelligentSitemapGenerator
    elif args.incremental:
        from sitemap_generator import IncrementalSitemapGenerator
        generator_class = IncrementalSitemapGenerator
    else:
        from sitemap_generator import AdvancedSitemapGenerator
        generator_class = AdvancedSitemapGenerator

    # Configure the generator
    generator_kwargs = {
        "start_url": args.url,
        "max_depth": args.depth,
        "max_concurrency": args.concurrency
    }

    # Load proxies if a proxy file was given
    # (note: only AntiBlockSitemapGenerator accepts a proxy_list argument)
    if args.proxy_file:
        with open(args.proxy_file, "r") as f:
            proxies = [line.strip() for line in f if line.strip()]
        generator_kwargs["proxy_list"] = proxies

    # Run the crawler
    async def run_crawler():
        generator = generator_class(**generator_kwargs)
        await generator.run()
        # Save the results
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        base_name = output_dir / f"sitemap_{timestamp}"
        formats = ["xml", "txt", "json"] if args.format == "all" else [args.format]
        for fmt in formats:
            generator.save_sitemap(f"{base_name}.{fmt}", fmt)

    try:
        asyncio.run(run_crawler())
    except KeyboardInterrupt:
        print("\nInterrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"Error: {str(e)}")
        sys.exit(1)


if __name__ == "__main__":
    main()
```

## 6. Performance Optimization and Best Practices

### 6.1 Performance Optimization Tips

- Connection pooling: tune the connection pool through aiohttp's TCPConnector.
- DNS caching: enable aiodns for asynchronous DNS resolution and caching.
- Memory management: release HTML content promptly once it is no longer needed.
- De-duplication: use a Bloom filter for large-scale URL de-duplication (see the sketch below).
- Incremental crawling: only re-crawl pages whose content has changed.
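The Bloom-filter point is the only item above not covered by code earlier in the article. Below is a minimal, self-contained sketch of how the in-memory `visited_urls` set could be swapped for a Bloom filter; the class name, bit-array size, and hash count are illustrative assumptions rather than part of the original project, and an off-the-shelf Bloom-filter library or a Redis-backed filter would usually be the more robust choice.

```python
import hashlib


class BloomFilterDeduper:
    """Minimal Bloom filter for URL de-duplication (illustrative sketch)."""

    def __init__(self, size_bits: int = 8_000_000, num_hashes: int = 5):
        self.size = size_bits
        self.num_hashes = num_hashes
        self.bits = bytearray(size_bits // 8 + 1)

    def _positions(self, item: str):
        # Derive num_hashes bit positions from one SHA-256 digest of the item.
        digest = hashlib.sha256(item.encode("utf-8")).digest()
        for i in range(self.num_hashes):
            chunk = digest[i * 4:(i + 1) * 4]
            yield int.from_bytes(chunk, "big") % self.size

    def add(self, item: str) -> None:
        for pos in self._positions(item):
            self.bits[pos // 8] |= 1 << (pos % 8)

    def __contains__(self, item: str) -> bool:
        return all(
            self.bits[pos // 8] & (1 << (pos % 8))
            for pos in self._positions(item)
        )


# Hypothetical usage inside crawl_page(), replacing the visited_urls set:
seen = BloomFilterDeduper()
url = "https://example.com/page"
if url not in seen:
    seen.add(url)
    # ... schedule the crawl for this URL ...
```

The trade-off is that membership tests can return false positives (a page may occasionally be skipped as "already seen") but never false negatives, while memory usage stays constant regardless of how many URLs are crawled.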