# MongoDB wrapper — implements data storage
"""MongoClass.py — thin MongoDB wrapper used to store and dump scraped profile records."""
import traceback

import pymongo


class MongoOpt:
    """Wraps one pymongo collection (database 'geng', collection 'gerenqingkuang').

    All operations are best-effort: failures are printed, never raised,
    matching the original crawler's fire-and-forget design.
    """

    def __init__(self):
        # Instance attributes (the original used shared class-level '' defaults,
        # which every instance would silently share; None per instance is safer).
        self.__client = None
        self.__db = None
        self.__col = None
        try:
            self.__client = pymongo.MongoClient("mongodb://127.0.0.1:27017/")
            self.__db = self.__client['geng']
            self.__col = self.__db['gerenqingkuang']
        except Exception:
            # Narrowed from a bare except: still best-effort, but no longer
            # swallows KeyboardInterrupt/SystemExit.
            traceback.print_exc()

    def add_collection(self, info):
        """Insert one document (a dict) into the collection.

        Errors are printed, not raised.
        """
        try:
            self.__col.insert_one(info)
        except Exception:
            traceback.print_exc()

    def get_all_collections(self):
        """Concatenate every field value of every stored document into one string.

        The projection excludes ``_id`` and the photo field ("相片").  The text
        is persisted to ./conf/233.txt (consumed later by the word-cloud step)
        and returned.  Returns None if the query or the file write fails.
        """
        try:
            values = []
            for record in self.__col.find({}, {"_id": 0, "相片": 0}):
                values.extend(record.values())
            # str.join replaces the original quadratic `text += str(v)` loop.
            text = ''.join(str(v) for v in values)
            with open("./conf/233.txt", 'w', encoding='utf-8') as txt:
                txt.write(text)
            return text
        except Exception:
            traceback.print_exc()


if __name__ == '__main__':
    mo = MongoOpt()
    # mo.add_collection({'name': "geng", 'sex': 'm'})
    res = mo.get_all_collections()
    print(res)
"""SpiderClass.py — HTTP fetch plus HTML extraction for dating-site profile pages."""
import re

import requests
from bs4 import BeautifulSoup as bs

# Strips any HTML tag.  Compiled once at module level instead of once per call,
# as the original did inside each method.
_TAG_RE = re.compile('<[^<]+?>')


class SpiderOpt:
    """Fetches one page on construction and extracts labelled field/value pairs."""

    def __init__(self, method, url, **kwargs):
        # The request is issued immediately; kwargs pass straight through to
        # requests.request (headers, timeout, ...).
        self.__response = requests.request(method, url, **kwargs)

    def get_response(self):
        """Decode the response body and build the BeautifulSoup tree.

        Must be called before any of the extraction methods.
        """
        self.__res = self.__response.text
        self.__bs = bs(self.__res, 'html.parser')

    def _extract_pairs(self, container_class):
        """Shared helper for the 'newshowN' sections (deduplicates the original
        copy-pasted bodies of get_person_details / get_family_love).

        Collects label elements (class 's1') and value elements (class 'txt')
        from every container with *container_class*.  Labels keep only the part
        before the full-width colon; values keep only the first
        whitespace-separated token.  Returns {label: value}.
        """
        labels, cells = [], []
        for detail in self.__bs.find_all(class_=container_class):
            labels.extend(detail.find_all(class_='s1'))
            cells.extend(detail.find_all(class_='txt'))
        keys = [_TAG_RE.sub('', str(el)).split(':')[0] for el in labels]
        values = [_TAG_RE.sub('', str(el)).split()[0] for el in cells]
        return dict(zip(keys, values))

    def get_person_details(self):
        """Personal-details section (class 'newshow2') as a dict."""
        return self._extract_pairs("newshow2")

    def get_family_love(self):
        """Family/relationship section (class 'newshow3') as a dict."""
        return self._extract_pairs("newshow3")

    def get_love_request(self):
        """Partner-requirements table (class 'tableB').

        The <td> cells alternate key, value, key, value, ...; returns them as
        a dict.  (Spam-URL garbage injected into the original re.sub call has
        been removed.)
        """
        tds = self.__bs.find(class_="tableB").find_all('td')
        keys, values = [], []
        for index, td in enumerate(tds):
            cell = _TAG_RE.sub('', str(td)).split(':')[0]
            (keys if index % 2 == 0 else values).append(cell)
        return dict(zip(keys, values))

    def get_jpg_src(self):
        """Return the href of the profile-photo link (class 'love_photo')."""
        photo = self.__bs.find(class_="love_photo")
        return photo.find('a')['href']
"""CloudClass.py — render a word-cloud image from the stored profile text."""
from wordcloud import WordCloud
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import jieba


class Cloud:
    """Generates ./conf/个人信息.png from Chinese text via jieba segmentation + WordCloud."""

    def get_cloud(self, text=None):
        """Build, display and save the word cloud.

        text: source text.  When None or empty, it is read from ./conf/233.txt
        (the file written by MongoOpt.get_all_collections).  The original
        declared a required ``text`` parameter but then unconditionally
        overwrote it from the file — which also made this module's own
        ``get_cloud()`` no-argument call a TypeError.  The default fixes both
        while staying backward compatible with callers that pass text.
        """
        font = r'./conf/FZSTK.TTF'  # Chinese-capable font, required for CJK glyphs
        if not text:
            with open('./conf/233.txt', 'r', encoding='utf-8') as fh:
                text = fh.read()
        # jieba segments Chinese into words; WordCloud expects space-separated tokens.
        string = ' '.join(jieba.cut(text))
        print(len(string))
        img_array = np.array(Image.open('./conf/timg.jpg'))  # image mask shaping the cloud
        # Noise words to exclude from the cloud (values unused; dict kept from original).
        stopwords = {'要求': 0, '汉族': 0, '厘米': 0, '公斤': 0, '父亲': 0,
                     '母亲': 0, '父母': 0, '随意': 0, '其他': 0, '退休': 0,
                     '对方': 0, '中文': 0, '普通话': 0, '不能': 0, '以上': 0,
                     '一般': 0, '无神论': 0, '建在': 0, '退休金': 0}
        wc = WordCloud(
            scale=4,                  # render resolution multiplier
            background_color='white',
            max_words=400,
            width=1000,
            height=800,
            mask=img_array,
            font_path=font,
            stopwords=stopwords,
        )
        wc.generate_from_text(string)
        plt.imshow(wc)  # original argument was garbled by an injected spam URL
        plt.axis('off')
        plt.figure()
        plt.show()
        wc.to_file('./conf/个人信息.png')


if __name__ == '__main__':
    cloud = Cloud()
    cloud.get_cloud()
"""Spider.py — main entry point: crawl profile pages into MongoDB, then build the word cloud."""
from yc.MongoClass import MongoOpt
from yc.SpiderClass import SpiderOpt
from yc.CloudClass import Cloud


class Spider:

    def spider(self, num):
        """Fetch profile page *num* and store its partner-requirements table in MongoDB.

        Returns True on success, False on any failure (missing page, parse
        error, DB error).  The target URL is reconstructed from the Host
        header — the original literal was corrupted by injected spam URLs;
        confirm 'https://love.yc123.com' against a live request.
        """
        try:
            headers = {
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
                "Connection": "keep-alive",
                "Cookie": "PHPSESSID=adgqumos4shf5pj3tkvfuohhs6; yc123_loveyun_loveloginMember=AAIGDhpVVQIHBggHWl1RUVMFAwcHAF8EUgEKUAFSXVZXV1BXVw%3D%3D; yc123_loveyun_showhxtoday=NA%3D%3D; yc123_loveyun_showweixinpushtoday=NA%3D%3D;",
                "Host": "love.yc123.com",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
            }
            spider = SpiderOpt('GET',
                               'https://love.yc123.com/show.php?id=' + str(num),
                               headers=headers)
            spider.get_response()
            person_details = spider.get_person_details()
            # family_love = spider.get_family_love()
            love_request = spider.get_love_request()
            # jpg_src = spider.get_jpg_src()
            result = {}
            # result.update(person_details)   # personal details
            # result.update(family_love)      # family situation
            result.update(love_request)       # partner requirements
            # result.update({"相片": jpg_src})  # photo URL
            mongo = MongoOpt()
            mongo.add_collection(result)
            return True
        except Exception:
            # Any failure marks the page as skipped; the crawl loop continues.
            return False


if __name__ == '__main__':
    spider = Spider()
    mongo = MongoOpt()
    cloud = Cloud()
    # Crawl loop (disabled): pages 3001-4719, counting failures/successes.
    # x, y = 0, 0
    # for i in range(3001, 4720):
    #     if not spider.spider(i):
    #         x += 1
    #         print(i, "page invalid")
    #         continue
    #     y += 1
    #     print(i, "page crawled")
    # print(x, y, sep=" ")
    text = mongo.get_all_collections()
    cloud.get_cloud(text)