博客
关于我
Python3+Wordcloud 实现单身相亲网站词云分析
阅读量:4987 次
发布时间:2019-06-12

本文共 5268 字,大约阅读时间需要 17 分钟。

MongoDB封装,实现数据存储

'''
存储文件:MongoClass.py
'''
import pymongo
import traceback

class MongoOpt:
    """Thin wrapper around the local MongoDB collection used by the spider.

    Connects to mongodb://127.0.0.1:27017/, database 'geng',
    collection 'gerenqingkuang' (personal-details documents).
    """

    def __init__(self):
        """Open the MongoDB connection; on failure the traceback is printed
        and the handles stay None (subsequent calls will also print errors)."""
        self.__client = None
        self.__db = None
        self.__col = None
        try:
            self.__client = pymongo.MongoClient("mongodb://127.0.0.1:27017/")
            self.__db = self.__client['geng']
            self.__col = self.__db['gerenqingkuang']
        except Exception:
            # Best-effort: log and continue, matching the module's style.
            traceback.print_exc()

    def add_collection(self, info):
        """Insert one document (a plain dict) into the collection."""
        try:
            self.__col.insert_one(info)
        except Exception:
            traceback.print_exc()

    def get_all_collections(self):
        """Concatenate the values of every stored document into one string,
        write it to ./conf/233.txt (UTF-8) and return it.

        Returns None if the dump fails (exception is printed).
        """
        try:
            values = []
            # Exclude the Mongo _id and the photo URL field ("相片") —
            # neither belongs in the word-cloud corpus.
            for doc in self.__col.find({}, {"_id": 0, "相片": 0}):
                values.extend(doc.values())
            # join() instead of repeated += (avoids quadratic string building)
            text = ''.join(str(v) for v in values)
            with open("./conf/233.txt", 'w', encoding='utf-8') as txt:
                txt.write(text)
            return text
        except Exception:
            traceback.print_exc()

if __name__ == '__main__':
    # Ad-hoc smoke test: dump everything stored so far and print it.
    store = MongoOpt()
    # store.add_collection({'name': "geng", 'sex': 'm'})
    dump = store.get_all_collections()
    print(dump)
爬虫封装,实现数据爬取
'''
文件目录:SpiderClass.py
'''
import requests
from bs4 import BeautifulSoup as bs
import re

class SpiderOpt:
    """Fetch one profile page and scrape labelled key/value details from it.

    Usage: construct (performs the HTTP request), call get_response() to
    parse the body, then call the get_* scrapers.
    """

    def __init__(self, method, url, **kwargs):
        # Issue the HTTP request immediately; kwargs (headers, timeout, ...)
        # pass straight through to requests.request.
        self.__response = requests.request(method, url, **kwargs)

    def get_response(self):
        """Parse the fetched body with BeautifulSoup (html.parser backend)."""
        self.__res = self.__response.text
        self.__bs = bs(self.__res, 'html.parser')

    def _scrape_pairs(self, section_class):
        """Collect label (class 's1') / value (class 'txt') pairs from every
        element with *section_class*, strip HTML tags, and return them as a
        dict. Shared by get_person_details and get_family_love (the original
        bodies were identical copy-paste except for the CSS class)."""
        keys = []
        values = []
        for detail in self.__bs.find_all(class_=section_class):
            keys.extend(detail.find_all(class_='s1'))
            values.extend(detail.find_all(class_='txt'))
        # Regex strips complete <...> HTML tags.
        re_html = re.compile('<[^<]+?>')
        keys = [re_html.sub('', str(k)).split(':')[0] for k in keys]
        values = [re_html.sub('', str(v)).split()[0] for v in values]
        return dict(zip(keys, values))

    def get_person_details(self):
        """Personal-details section (CSS class 'newshow2') as a dict."""
        return self._scrape_pairs("newshow2")

    def get_family_love(self):
        """Family-situation section (CSS class 'newshow3') as a dict."""
        return self._scrape_pairs("newshow3")

    def get_love_request(self):
        """Partner-requirements table (CSS class 'tableB') as a dict.

        Table cells alternate label, value, label, value, ...
        """
        table = self.__bs.find(class_="tableB")
        tds = table.find_all('td')
        re_html = re.compile('<[^<]+?>')
        keys = []
        values = []
        for i, td in enumerate(tds):
            # NOTE(review): the blog post had spam text injected here
            # (re_html.sub(www...''...)); restored to an empty replacement.
            s = re_html.sub('', str(td)).split(':')[0]
            if i % 2 == 0:
                keys.append(s)
            else:
                values.append(s)
        return dict(zip(keys, values))

    def get_jpg_src(self):
        """Return the href of the profile-photo link (class 'love_photo')."""
        # Restored class name string — the post had spam injected into it.
        photo = self.__bs.find(class_="love_photo")
        jpg_src = photo.find('a')
        return jpg_src['href']
词云封装,实现词图生成
'''
文件目录:CloudClass.py
'''
from wordcloud import WordCloud
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import jieba

class Cloud:
    """Generate a word-cloud image from the scraped profile corpus."""

    def __init__(self):
        pass

    def get_cloud(self, text=None):
        """Render a word cloud, display it, and save it as a PNG.

        text: corpus to visualise. When None or empty, it is loaded from
        ./conf/233.txt (the dump written by MongoOpt). The original code
        ignored this parameter entirely (it was immediately overwritten by
        the file read) while one caller passed it and another passed
        nothing; a defaulted, honoured parameter fixes both call sites.
        """
        font = r'./conf/FZSTK.TTF'  # font path (must support CJK glyphs)
        if not text:
            # Context manager: the original leaked the file handle.
            with open('./conf/233.txt', 'r', encoding='utf-8') as f:
                text = f.read()
        string = ' '.join(jieba.cut(text))  # jieba segments Chinese words
        print(len(string))
        img = Image.open('./conf/timg.jpg')  # mask image
        img_array = np.array(img)  # convert image to array for the mask
        # Noise words (labels/units, not content) excluded from the cloud.
        stopwords = {'要求': 0, '汉族': 0, '厘米': 0, '公斤': 0, '父亲': 0, '母亲': 0, '父母': 0,
                     '随意': 0, '其他': 0, '退休': 0, '对方': 0, '中文': 0, '普通话': 0, '不能': 0,
                     '以上': 0, '一般': 0, '无神论': 0, '建在': 0, '退休金': 0
                     }
        wc = WordCloud(
            scale=4,                    # render resolution
            background_color='white',
            max_words=400,
            width=1000,
            height=800,
            mask=img_array,
            font_path=font,
            stopwords=stopwords
        )
        wc.generate_from_text(string)
        # The post's code had spam injected into this call; the argument
        # must be the WordCloud image. The stray plt.figure() that opened
        # a second blank window is also dropped.
        plt.imshow(wc)
        plt.axis('off')
        plt.show()  # display the figure
        wc.to_file('./conf/个人信息.png')  # save the image

if __name__ == '__main__':

    cloud = Cloud()
    # Pass None explicitly: the original call passed no argument, which
    # raises TypeError because get_cloud declares a required `text`
    # parameter. The method reads its corpus from ./conf/233.txt anyway.
    cloud.get_cloud(None)

主函数

'''
文件目录:Spider.py
'''
from yc.MongoClass import MongoOpt
from yc.SpiderClass import SpiderOpt
from yc.CloudClass import Cloud

class Spider:
    """Crawl one profile page by numeric id and store the scraped fields."""

    def spider(self, num):
        """Fetch profile *num*, scrape the partner-requirements table and
        insert the result into MongoDB.

        Returns True on success, False on any failure (network, parse, DB).
        """
        try:
            headers = {
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
                "Connection": "keep-alive",
                "Cookie": "PHPSESSID=adgqumos4shf5pj3tkvfuohhs6; yc123_loveyun_loveloginMember=AAIGDhpVVQIHBggHWl1RUVMFAwcHAF8EUgEKUAFSXVZXV1BXVw%3D%3D; yc123_loveyun_showhxtoday=NA%3D%3D; yc123_loveyun_showweixinpushtoday=NA%3D%3D;",
                # Host header restored — the blog post had spam injected here.
                "Host": "love.yc123.com",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
            }
            # URL reconstructed from the Host header; the post's version had
            # spam text spliced into the string — TODO confirm the scheme.
            spider = SpiderOpt('GET', 'https://love.yc123.com/show.php?id=' + str(num), headers=headers)
            spider.get_response()
            person_details = spider.get_person_details()
            # family_love = spider.get_family_love()
            love_request = spider.get_love_request()
            # jpg_src = spider.get_jpg_src()
            result = {}
            # result.update(person_details)  # personal details
            # result.update(family_love)     # family situation
            result.update(love_request)      # 择偶要求 (partner requirements)
            # result.update({"相片": jpg_src})
            mongo = MongoOpt()
            mongo.add_collection(result)
            return True
        except Exception:
            # Exception (not bare except) so Ctrl-C still interrupts a crawl.
            return False

if __name__ == '__main__':

    crawler = Spider()
    store = MongoOpt()
    painter = Cloud()
    # Crawl pass (disabled): ids 3001-4719, counting failures / successes.
    # bad, good = 0, 0
    # for pid in range(3001, 4720):
    #     if not crawler.spider(pid):
    #         bad += 1
    #         print(pid, "爬取无效")
    #         continue
    #     good += 1
    #     print(pid, "爬取完成")
    # print(bad, good, sep=" ")
    corpus = store.get_all_collections()
    painter.get_cloud(corpus)

转载于:https://www.cnblogs.com/qwangxiao/p/11148012.html

你可能感兴趣的文章
[LeetCode] 342. Power of Four 4的次方数
查看>>
with上下文管理器
查看>>
MySQL中 如何查询表名中包含某字段的表 ,查询MySql数据库架构信息:数据库,表,表字段...
查看>>
03-position和anchorPoint
查看>>
windows 下 nginx 的启动 停止 关闭
查看>>
Django 数据表更改
查看>>
java io读写文件
查看>>
【智能算法】粒子群寻优算法
查看>>
生活中一面,网上另一面
查看>>
[TensorFlow]TensorFlow安装方法
查看>>
机器学习实战-----八大分类器识别树叶带源码
查看>>
springMVC国际化配置和使用
查看>>
centos7下SVN服务器如何搭建
查看>>
CSS自学笔记(11):CSS3背景和边框
查看>>
EL表达式 (详解)
查看>>
重载和重写的区别
查看>>
跟我一起读postgresql源码(十二)——Executor(查询执行模块之——Materialization节点(下))...
查看>>
发音篇-第三章 自然发音法
查看>>
js计算滚动条高度及窗口高度
查看>>
这篇 感觉很实用--DJANGO ORM
查看>>