鸡仔说: In the previous section we walked through an LRU algorithm implementation in code. This time we use Python's built-in lru_cache from functools to build a generic hot-list crawler, starting from the Zhihu hot list. Enough talk, let's get to it.
First, let's get familiar with the basic usage of the built-in lru_cache. The code looks like this:
from functools import lru_cache
import random


@lru_cache(maxsize=3)
def get_cloth(event):
    event_cloth_map = {
        "见鸡仔": "背心+裤衩",
        "求职": "西服+衬衣",
        "跳舞": "卫衣+休闲裤",
        "工作": "外套+T恤+休闲裤",
    }
    if isinstance(event, int):
        if event < 4:
            return "羽绒服"
        elif 4 <= event < 6:
            return "外套"
        elif 6 <= event < 8:
            return "毛衣"
        elif 8 <= event < 10:
            return "大衣"
        elif 10 <= event < 12:
            return "马夹"
        elif 12 <= event < 14:
            return "卫衣"
        elif 14 <= event < 16:
            return "卫衣"
        elif 16 < event < 20:
            return "长袖"
        else:
            return "短袖"
    elif isinstance(event, str) and event in event_cloth_map:
        return event_cloth_map[event]
    return "裸奔"


if __name__ == '__main__':
    random.seed(444)
    for _ in range(10):
        temperature = random.randint(0, 30)
        print(f"当前温度>>>{temperature},穿{get_cloth(temperature)}")
    e = "见鸡仔"
    print(f"如果{e},就穿{get_cloth(e)}")
    e = "求职"
    print(f"如果{e},就穿{get_cloth(e)}")
    print(get_cloth.cache_info())
当前温度>>>9,穿大衣
当前温度>>>9,穿大衣
当前温度>>>0,穿羽绒服
当前温度>>>9,穿大衣
当前温度>>>15,穿卫衣
当前温度>>>8,穿大衣
当前温度>>>27,穿短袖
当前温度>>>9,穿大衣
当前温度>>>23,穿短袖
当前温度>>>16,穿短袖
如果见鸡仔,就穿背心+裤衩
如果求职,就穿西服+衬衣
CacheInfo(hits=2, misses=10, maxsize=3, currsize=3)

The final CacheInfo line tells the story: out of 12 calls, 10 missed the cache and 2 hit it, and the cache never holds more than maxsize=3 entries, so the least recently used entry is evicted whenever a new one arrives.
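Besides cache_info(), the decorated function also exposes cache_clear(), and lru_cache only works when every argument is hashable. A minimal sketch (not from the original article) illustrating both points:

# Hedged sketch: cache_clear() resets the statistics, and unhashable arguments fail fast.
print(get_cloth.cache_info())    # e.g. CacheInfo(hits=2, misses=10, maxsize=3, currsize=3)

get_cloth.cache_clear()          # drop every cached entry
print(get_cloth.cache_info())    # CacheInfo(hits=0, misses=0, maxsize=3, currsize=0)

try:
    get_cloth(["见鸡仔", "跳舞"])   # a list cannot be a cache key
except TypeError as e:
    print(f"unhashable argument: {e}")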




With the basics covered, let's apply lru_cache to something more useful: a small hot-list crawler exposed over HTTP, where the cache keeps us from hammering the source sites on every request. The project is split into four modules:
├── parser.py   # parser
├── run.py      # entry point
├── saver.py    # storage
└── spider.py   # spiders

Each library module exposes a base class plus concrete implementations, one per platform or storage backend:
# spider.py
class BaseSpider:
    pass

class ZhihuSpider(BaseSpider):
    pass

class V2exSpider(BaseSpider):
    pass
# parser.py
class BaseParser:
    pass

class JsonFormatParser(BaseParser):
    pass
# saver.py
class BaseSaver:
    pass

class MongoSaver(BaseSaver):
    pass

class MysqlSaver(BaseSaver):
    pass

class ExcelSaver(BaseSaver):
    pass
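In the full implementation below, the spider and saver base classes are abstract (abc.ABC plus @abstractmethod), so a subclass that forgets to implement the required method cannot even be instantiated. A quick sketch of that guarantee (illustration only, BrokenSpider is made up):

from abc import ABC, abstractmethod

class BaseSpider(ABC):
    @abstractmethod
    def spider(self):
        pass

class BrokenSpider(BaseSpider):
    # forgot to implement spider()
    pass

try:
    BrokenSpider()
except TypeError as e:
    # "Can't instantiate abstract class BrokenSpider ..." is raised at construction time
    print(e)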
spider.py: every platform gets its own spider class. They all share the request headers defined on BaseSpider, and each implements spider() to fetch and return the platform's hot-list JSON:
from abc import (ABC, abstractmethod)

import requests


class BaseSpider(ABC):
    _headers = {
        'accept': 'application/json, text/plain, */*',
        "accept-encoding": "gzip, deflate, br",
        "cache-control": "no-cache",
        'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Ch'
                      'rome/84.0.4147.135 Safari/537.36',
        'content-type': 'application/json;charset=UTF-8',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'cors',
        'sec-fetch-dest': 'empty',
        "pragma": "no-cache",
    }

    @abstractmethod
    def spider(self):
        pass


class ZhihuSpider(BaseSpider):
    url = "https://www.zhihu.com/api/v3/feed/topstory/hot-lists/total?limit=50&desktop=true"

    def spider(self):
        resp = requests.get(self.url, headers=self._headers, timeout=30)
        return resp.json()


class V2exSpider(BaseSpider):
    url = "https://www.v2ex.com/api/topics/hot.json"

    def spider(self):
        resp = requests.get(self.url, headers=self._headers, timeout=30)
        return resp.json()


class BilibiliSpider(BaseSpider):
    url = "https://api.bilibili.com/x/web-interface/ranking/v2?rid=0&type=all"

    def spider(self):
        resp = requests.get(self.url, headers=self._headers, timeout=30)
        return resp.json()
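Adding a new platform only means subclassing BaseSpider. A hedged sketch with a placeholder endpoint (WeiboSpider and its URL are made up for illustration):

class WeiboSpider(BaseSpider):
    # hypothetical endpoint, replace with the real hot-list API you want to crawl
    url = "https://example.com/weibo/hot.json"

    def spider(self):
        resp = requests.get(self.url, headers=self._headers, timeout=30)
        return resp.json()

It would still need a matching parse method in JsonFormatParser and an entry in run.py's spider_map.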
Parsed items are normalized into a small attrs data class, News (the parser imports it with from .obj import News):

from attr import attrs, attrib, validators


@attrs
class News:
    _platform = attrib(type=str, validator=validators.instance_of(str))
    title = attrib(type=str, validator=validators.instance_of(str))
    url = attrib(type=str, validator=validators.instance_of(str))
    reply_cnt = attrib(type=int, validator=validators.instance_of(int), default=-1)
    summary = attrib(type=str, validator=validators.instance_of(str), default="")
    publish_time = attrib(type=str, validator=validators.instance_of(str), default="")
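Two things about this class are easy to trip over: the leading underscore on _platform means the generated __init__ takes a plain platform= keyword, and the validators reject wrong types at construction time. A small sketch (illustration only, the sample values are made up):

# attrs strips the leading underscore for the __init__ argument name
n = News(platform="zhihu", title="某个热榜问题", url="https://www.zhihu.com/question/1")
print(n)

try:
    News(platform="zhihu", title="t", url="u", reply_cnt="not an int")
except TypeError as e:
    # validators.instance_of(int) raises TypeError when the type is wrong
    print(e)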
parser.py: JsonFormatParser dispatches on the topic name, so parse("zhihu", result) calls self.zhihu(result), and each platform method uses glom to pull fields out of the nested JSON:

from datetime import datetime
import inspect

from glom import glom
from loguru import logger

from .obj import News


class BaseParser:
    pass


class JsonFormatParser(BaseParser):
    def __init__(self):
        pass

    def __get_all_method(self):
        # every public method name on this class is a usable topic parser
        instance_members = inspect.getmembers(self, predicate=inspect.ismethod)
        return map(lambda x: x[0], filter(lambda x: not x[0].startswith("_"), instance_members))

    def parse(self, topic, result):
        # a topic may not be named "parse"
        if topic in ["parse"]:
            raise ValueError("topic 不可以命名成 parse")
        all_method_name = list(self.__get_all_method())
        if topic not in all_method_name:
            raise ValueError(f"没有找到可用的解析函数:topic={topic}\tall_method_name={all_method_name}")
        return getattr(self, topic)(result)

    def zhihu(self, result):
        all_result = []
        for per_result in result["data"]:
            try:
                n = News(
                    platform="zhihu",
                    title=glom(per_result, "target.title"),
                    url=glom(per_result, "target.url"),
                    reply_cnt=int(glom(per_result, "target.answer_count")),
                    summary=glom(per_result, "target.excerpt"),
                )
                all_result.append(n)
            except Exception as e:
                logger.error("解析 知乎 数据错误:")
                logger.error(f"per_result={per_result}")
                logger.exception(e)
        return all_result

    def v2ex(self, result):
        all_result = []
        for per_result in result:
            try:
                n = News(
                    platform="v2ex",
                    title=per_result.get("title"),
                    url=per_result.get("url"),
                    publish_time=str(datetime.fromtimestamp(per_result.get("created"))),
                    summary=per_result.get("content"),
                    reply_cnt=int(per_result.get("replies", -1)),
                )
                all_result.append(n)
            except Exception as e:
                logger.error("解析 v2ex 数据错误:")
                logger.error(f"per_result={per_result}")
                logger.exception(e)
        return all_result

    def bilibili(self, result):
        all_result = []
        for per_result in glom(result, "data.list"):
            try:
                n = News(
                    platform="bilibili",
                    title=per_result.get("title"),
                    url=f"https://www.bilibili.com/video/{per_result.get('bvid')}",
                    summary=per_result.get("desc"),
                    reply_cnt=int(glom(per_result, "stat.reply", default=-1)),
                    publish_time=str(datetime.fromtimestamp(per_result.get("ctime"))),
                )
                all_result.append(n)
            except Exception as e:
                logger.error("解析 b站 数据错误:")
                logger.error(f"per_result={per_result}")
                logger.exception(e)
        return all_result
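glom does the heavy lifting for nested access: glom(data, "a.b.c") walks the dictionaries for you and can fall back to a default when the path is missing. A tiny standalone sketch (the sample dict is made up):

from glom import glom

per_result = {"target": {"title": "某个热榜问题", "answer_count": 128}}

print(glom(per_result, "target.title"))            # 某个热榜问题
print(glom(per_result, "target.answer_count"))     # 128
print(glom(per_result, "stat.reply", default=-1))  # -1, path does not exist so the default is used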
saver.py: every saver first unpacks the News objects back into plain dicts with cattr.unstructure, then "saves" them (in this example the save step is just a log line):

from abc import ABC, abstractmethod

from cattr import unstructure
from loguru import logger


class BaseSaver(ABC):
    # unpack the packaged objects; in this example they are News instances
    def _unpack_struct(self, data_obj):
        return unstructure(data_obj)

    @abstractmethod
    def save(self, objs):
        pass


class MongoSaver(BaseSaver):
    def save(self, objs):
        unpack_data = list(map(self._unpack_struct, objs))
        logger.info(f"save all data to mongo {unpack_data}")


class MysqlSaver(BaseSaver):
    def save(self, objs):
        unpack_data = list(map(self._unpack_struct, objs))
        logger.info(f"save all data to mysql {unpack_data}")


class ExcelSaver(BaseSaver):
    def save(self, objs):
        unpack_data = list(map(self._unpack_struct, objs))
        logger.info(f"save all data to excel {unpack_data}")
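cattr.unstructure turns an attrs instance into a plain dict keyed by the attribute names, which is why the log output later shows the key '_platform' with its underscore. A minimal sketch, assuming the News class above:

from cattr import unstructure

n = News(platform="zhihu", title="标题", url="https://www.zhihu.com/question/1")
print(unstructure(n))
# roughly: {'_platform': 'zhihu', 'title': '标题', 'url': 'https://www.zhihu.com/question/1',
#           'reply_cnt': -1, 'summary': '', 'publish_time': ''}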
run.py: the entry point wires everything together behind a tiny Flask app. grab_hot picks the spider and the saver, parses the result, and is itself wrapped in lru_cache(maxsize=3), so identical requests are answered from memory:

import sys

sys.path.append("..")

from functools import lru_cache

from cattr import unstructure
from flask import (Flask, request)
from loguru import logger

from lru_lab.spider import (ZhihuSpider, V2exSpider, BilibiliSpider)
from lru_lab.parser import JsonFormatParser
from lru_lab.saver import (ExcelSaver, MongoSaver, MysqlSaver)

spider_map = {
    "bilibili": BilibiliSpider().spider,
    "v2ex": V2exSpider().spider,
    "zhihu": ZhihuSpider().spider,
}
save_map = {
    "excel": ExcelSaver().save,
    "mysql": MysqlSaver().save,
    "mongo": MongoSaver().save,
}


@lru_cache(maxsize=3)
def grab_hot(topic, save_method='excel'):
    spider = spider_map.get(topic)
    save = save_map.get(save_method)
    spider_result = spider()
    parse_result = JsonFormatParser().parse(topic, spider_result)
    save(parse_result)
    return list(map(unstructure, parse_result))


app = Flask(__name__)
# keep Chinese readable in the JSON responses
app.config["JSON_AS_ASCII"] = False


@app.route("/", methods=["post"])
def index():
    res = request.json
    topic = res.get("topic")
    if not topic or topic not in spider_map:
        logger.warning(f"topic error:{topic}")
        return {"status": -1, "message": "未知主题"}
    save_method = res.get("save") or "excel"
    result = grab_hot(topic, save_method)
    logger.info(grab_hot.cache_info())
    return {"status": 1, "results": result}


if __name__ == '__main__':
    app.run(host="0.0.0.0", debug=True)
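One subtlety worth knowing: lru_cache keys on the exact call arguments, so grab_hot("zhihu") and grab_hot("zhihu", "excel") are two separate cache entries even though the default makes them behave the same. A small sketch of that behaviour (illustration only, grab is a stand-in for grab_hot):

from functools import lru_cache

@lru_cache(maxsize=3)
def grab(topic, save_method='excel'):
    print(f"really crawling {topic} -> {save_method}")
    return topic

grab("zhihu")             # miss: cache key is ("zhihu",)
grab("zhihu", "excel")    # miss again: cache key is ("zhihu", "excel")
grab("zhihu")             # hit
print(grab.cache_info())  # CacheInfo(hits=1, misses=2, maxsize=3, currsize=2)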
Start the service:

>>>python run.py
 * Serving Flask app "run" (lazy loading)
 * Environment: production
   WARNING: This is a development server. Do not use it in a production deployment.
   Use a production WSGI server instead.
 * Debug mode: on
 * Running on http://0.0.0.0:5000/ (Press CTRL+C to quit)
 * Restarting with stat
 * Debugger is active!
 * Debugger PIN: 317-909-129

and trigger a crawl from another terminal:
curl -X POST -H 'content-type: application/json' -d '{"topic":"zhihu"}' http://127.0.0.1:5000
The first request misses the cache, so the spider actually runs and the result is handed to ExcelSaver:

2021-03-01 09:09:24.212 | INFO | lru_lab.saver:save:39 - save all data to excel [{'_platform': 'zhihu', 'title': '如何看待山东拉面哥十几...
2021-03-01 09:09:24.258 | INFO | __main__:index:67 - CacheInfo(hits=0, misses=1, maxsize=3, currsize=1)
127.0.0.1 - - [01/Mar/2021 09:09:24] "POST / HTTP/1.1" 200 -
Sending the same topic again is served straight from the cache: the hit counter climbs and no new crawl or save happens:

2021-03-01 09:12:14.628 | INFO | __main__:index:67 - CacheInfo(hits=1, misses=1, maxsize=3, currsize=1)
127.0.0.1 - - [01/Mar/2021 09:12:14] "POST / HTTP/1.1" 200 -
2021-03-01 09:12:21.598 | INFO | __main__:index:67 - CacheInfo(hits=2, misses=1, maxsize=3, currsize=1)
127.0.0.1 - - [01/Mar/2021 09:12:21] "POST / HTTP/1.1" 200 -

That is exactly the caching we wanted, but it also exposes the problem: once a topic is cached it never refreshes, so the "hot" list stops being hot. lru_cache has no built-in expiry, so we wrap it in a small decorator class that throws the whole cache away once it is older than a given timeout:
import time
from functools import lru_cache

from loguru import logger


class LruCache:
    def __init__(self, maxsize=3, timeout=2):
        self.maxsize = maxsize
        self.timeout = timeout              # seconds the cached results are allowed to live
        self.last_time = int(time.time())

    def __call__(self, func):
        func = lru_cache(maxsize=self.maxsize)(func)

        def wrapper(*args, **kwargs):
            # if the cache is older than `timeout`, log its stats and clear it
            if int(time.time()) - self.last_time > self.timeout:
                logger.debug(func.cache_info())
                func.cache_clear()
                self.last_time = int(time.time())
            return func(*args, **kwargs)

        return wrapper
Then swap the decorator on grab_hot:

@LruCache(maxsize=3, timeout=2)
def grab_hot(topic, save_method='excel'):

To see with our own eyes whether a request really goes out to Zhihu or is answered from the cache, add a debug line at the top of ZhihuSpider.spider so each real crawl announces itself:
    # spider.py now also needs: from loguru import logger
    def spider(self):
        logger.debug("开始知乎爬虫...")
        resp = requests.get(self.url, headers=self._headers, timeout=30)
        return resp.json()
Restart the service and fire a burst of identical requests:

curl -X POST -H 'content-type: application/json' -d '{"topic":"zhihu"}' http://127.0.0.1:5000
curl -X POST -H 'content-type: application/json' -d '{"topic":"zhihu"}' http://127.0.0.1:5000
curl -X POST -H 'content-type: application/json' -d '{"topic":"zhihu"}' http://127.0.0.1:5000
curl -X POST -H 'content-type: application/json' -d '{"topic":"zhihu"}' http://127.0.0.1:5000
curl -X POST -H 'content-type: application/json' -d '{"topic":"zhihu"}' http://127.0.0.1:5000
...
2021-03-02 20:43:41.819 | DEBUG | __main__:wrapper:50 - CacheInfo(hits=0, misses=0, maxsize=3, currsize=0)
2021-03-02 20:43:41.820 | DEBUG | lru_lab.spider:spider:35 - 开始知乎爬虫...
2021-03-02 20:43:42.048 | INFO | lru_lab.saver:save:39 - save all data to excel [{'_platform': 'zhihu', 'title': '如何看待日本...
127.0.0.1 - - [02/Mar/2021 20:43:42] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [02/Mar/2021 20:43:42] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [02/Mar/2021 20:43:43] "POST / HTTP/1.1" 200 -
2021-03-02 20:43:44.588 | DEBUG | __main__:wrapper:50 - CacheInfo(hits=2, misses=1, maxsize=3, currsize=1)
2021-03-02 20:43:44.588 | DEBUG | lru_lab.spider:spider:35 - 开始知乎爬虫...
2021-03-02 20:43:44.764 | INFO | lru_lab.saver:save:39 - save all data to excel [{'_platform': 'zhihu', 'title': '如何看待日本...
127.0.0.1 - - [02/Mar/2021 20:43:44] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [02/Mar/2021 20:43:45] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [02/Mar/2021 20:43:45] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [02/Mar/2021 20:43:46] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [02/Mar/2021 20:43:46] "POST / HTTP/1.1" 200 -
2021-03-02 20:43:47.389 | DEBUG | __main__:wrapper:50 - CacheInfo(hits=4, misses=1, maxsize=3, currsize=1)
2021-03-02 20:43:47.389 | DEBUG | lru_lab.spider:spider:35 - 开始知乎爬虫...
2021-03-02 20:43:47.568 | INFO | lru_lab.saver:save:39 - save all data to excel [{'_platform': 'zhihu', 'title': '如何看待日本...
127.0.0.1 - - [02/Mar/2021 20:43:47] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [02/Mar/2021 20:43:47] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [02/Mar/2021 20:43:48] "POST / HTTP/1.1" 200 -
2021-03-02 20:44:18.904 | DEBUG | __main__:wrapper:50 - CacheInfo(hits=2, misses=1, maxsize=3, currsize=1)
2021-03-02 20:44:18.904 | DEBUG | lru_lab.spider:spider:35 - 开始知乎爬虫...

Whenever the wrapper notices the cache is older than the timeout, it logs the stats, clears the cache, and the next request triggers a real crawl again ("开始知乎爬虫..."); the burst of requests in between is answered purely from the cache. With that, lru_cache has gone from a clothing-picker toy to a hot-list crawler that refreshes itself on a schedule.
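If you want to check the wrapper without Flask in the way, a few lines are enough. A minimal sketch, assuming the LruCache class above is importable; slow_square is made up for the demo:

import time

@LruCache(maxsize=3, timeout=2)
def slow_square(n):
    print(f"actually computing {n} * {n}")   # only printed on a real cache miss
    return n * n

slow_square(3)   # computed
slow_square(3)   # answered from the cache, nothing printed
time.sleep(3)    # wait longer than the 2-second timeout
slow_square(3)   # the cache was cleared, so it is computed again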








