首页
>
精通Python网络编程 > 爬虫代码测试文章
- 彻底掌握Python中的网络编程
- 已经是最后一篇了
# __author__ = 薯条老师
# __date__ = 2020-12-16
import re


class HTMLParser:
    """Minimal regex-based finder for HTML elements.

    This is a teaching-sized helper, not a real HTML parser. Known limits:
    nested elements of the same tag are not balanced (the match ends at the
    first closing tag), and a literal ``>`` inside an attribute value is
    not supported.
    """

    class Label:
        """A single matched element.

        Holds the tag name, the raw inner HTML, and the attribute filters
        that were passed to ``HTMLParser.find`` (the attributes are NOT
        re-parsed from the markup; only the filters used are stored).
        """

        def __init__(self, label, text, **kwargs):
            """Store the tag name, the raw inner HTML and the attribute filters."""
            self.__label = label
            self.__text = text
            self.__build_attrs(**kwargs)

        def __build_attrs(self, **kwargs):
            """Expose every filter attribute through ``self.__dict__``."""
            self.__dict__.update(kwargs)

        def get_text(self):
            """Return the inner HTML with every tag removed."""
            # ``[^>]*`` (instead of ``.*?``) also strips tags that contain
            # line breaks, which ``.`` does not match by default.
            return re.sub(r"<[^>]*>", "", self.__text)

        def __getitem__(self, name):
            """Return a stored attribute by name, or ``None`` when absent.

            ``label["text"]`` yields the tag-stripped text; it is computed on
            first access and cached, unless an attribute literally named
            ``text`` was already stored.
            """
            if name == "text" and name not in self.__dict__:
                self.__dict__[name] = self.get_text()
            return self.__dict__.get(name)

    def __init__(self, html):
        """Remember the HTML document to search."""
        self.__html = html

    @property
    def html(self):
        """The HTML document given to the constructor."""
        return self.__html

    def __build_pattern(self, label, **kwargs):
        """Build the regex that matches ``<label ...>inner</label>``.

        Each keyword is an attribute filter; ``_class`` stands for the HTML
        ``class`` attribute (``class`` is a Python keyword). The pattern has
        exactly one capturing group: the element's inner HTML.
        """
        # The tag name must be followed by whitespace, "/" or ">" so that
        # searching for "p" does not match "<pre>".
        parts = [r"<{}(?=[\s/>])".format(re.escape(label))]
        for attr, value in kwargs.items():
            if attr == "_class":
                attr = "class"
            escaped = re.escape(str(value))
            # One lookahead per attribute: order-independent, confined to the
            # opening tag via ``[^>]*``, literal value (escaped), and either
            # quoting style accepted.
            parts.append(
                r"""(?=[^>]*\s{0}\s*=\s*(?:"{1}"|'{1}'))""".format(
                    re.escape(attr), escaped
                )
            )
        parts.append(r"[^>]*>(.*?)</{}\s*>".format(re.escape(label)))
        return "".join(parts)

    def find(self, label, **kwargs):
        """Return a list of ``Label`` objects for every matching element.

        :param label: tag name, e.g. ``"p"`` or ``"a"``.
        :param kwargs: attribute filters, e.g. ``href="..."`` or
            ``_class="article"``; all of them must be present on the tag.
        """
        pattern = self.__build_pattern(label, **kwargs)
        # DOTALL lets the element content span several lines.
        return [
            HTMLParser.Label(label, inner, **kwargs)
            for inner in re.findall(pattern, self.__html, re.DOTALL)
        ]


if __name__ == "__main__":
    html = "<html><body><p class='article'>在线Python教程 " \
           "<a href='www.chipscoco.com'>Python零基础入门指南</a></p>" \
           "<div><p><a href=\"www.chipscoco.com\">与薯条老师一起学Python</a></p>" \
           "</div></body></html>"
    html_parser = HTMLParser(html)

    labels = html_parser.find("p")
    for label in labels:
        print(label.get_text())

    labels = html_parser.find("a", href="www.chipscoco.com")
    for label in labels:
        print(label["href"], label["text"])
- 彻底掌握Python中的网络编程
- 已经是最后一篇了