当前位置: 首页 > news >正文

谷歌广告优化师凤山网站seo

谷歌广告优化师,凤山网站seo,郑州做网站汉狮,本地使用宝塔安装wordpress内容仅供学习参考,如有侵权联系删除 先通过京东非自营的店铺名拿到的公司名,再通过公司名称去其他平台拿到联系方式(代码省略) from aioscrapy.spiders import Spider from aioscrapy.http import Request, FormRequest import dd…

内容仅供学习参考,如有侵权联系删除

先通过京东非自营的店铺名拿到的公司名,再通过公司名称去其他平台拿到联系方式(代码省略)


from aioscrapy.spiders import Spider
from aioscrapy.http import Request, FormRequest
import ddddocr
import re
import randomfrom loguru import loggerclass JingDongSpider(Spider):name = 'products:jd'custom_settings = {'CONCURRENT_REQUESTS': 4,# 'DOWNLOAD_DELAY': 0.5,'DOWNLOAD_TIMEOUT': 10,'RETRY_TIMES': 5,'HTTPERROR_ALLOWED_CODES': [503],'COOKIES_ENABLED': False,'DUPEFILTER_CLASS': 'aioscrapy.dupefilters.redis.RFPDupeFilter',  # 过滤方法# 'LOG_LEVEL': 'DEBUG'}ocr = ddddocr.DdddOcr(show_ad=False, use_gpu=True)async def start_requests(self):yield Request(url=f"https://mall.jd.com/index-11111111.html?from=pc",method='GET',dont_filter=False,# fingerprint=str(i),# meta={"shop_id": str(i)},priority=500)async def parse(self, response):"""店铺首页"""title = response.xpath('//title/text()').get() or ''shop_id = str(response.meta['shop_id'])if '您所访问的页面不存在' in str(title) or len(response.text) < 25000:logger.info(f"{shop_id}")returnlogger.info(title.strip())product_list = self.get_product_items(response)urls = re.findall(r"//\w+\.jd\.com/view_search-\d+-\d+-\d+-\d+-\d+-\d+\.html", response.text)yield Request(url=f"https://mall.jd.com/sys/vc/createVerifyCode.html?random={random.random()}",method='GET',callback=self.parse_img_code,dont_filter=True,meta={"data": {"product_url": 'https:' + urls[0] if urls else '',"categorys": self.get_category(response),"product_list": product_list,# "shop_url": response.url,"shop_id": shop_id}},priority=500)async def parse_img_code(self, response):"""验证码"""code = self.ocr.classification(response.body)cookie = dict(response.cookies.items())shop_id = response.meta["data"]["shop_id"]if not code or not cookie:returnyield FormRequest(url=f'https://mall.jd.com/showLicence-{shop_id}.html',method='POST',formdata={"verifyCode": str(code)},cookies=cookie,meta={"data": response.meta["data"]},callback=self.parse_shop_detail,dont_filter=True,priority=400)async def parse_shop_detail(self, response):""" 解析店铺详情"""company = response.xpath('//*[contains(.,"企业名称:")]/following-sibling::span[position()=1]/text()').get() or ''shop_name = response.xpath('//*[contains(.,"店铺名称:")]/following-sibling::span[position()=1]//text()').get() or ''shop_url = response.xpath('//*[contains(.,"店铺网址:")]/following-sibling::span[position()=1]//text()').get()# legal_person = response.xpath( '//*[contains(.,"法定代表人姓名:")]/following-sibling::span[position()=1]//text()').get()# business_scope = response.xpath( '//*[contains(.,"营业执照经营范围:")]/following-sibling::span[position()=1]//text()').get()license = response.xpath('//img[@class="qualification-img"]/@src').get() or ''if not company or '测试' in shop_name or '测试' in company:if not company:logger.info(f"无公司: {response.url}")else:logger.info(f" {shop_name} => {company}")returnelse:logger.info(company)data = response.meta['data']data['company'] = companydata['shop_name'] = shop_nameitems = dict(company=company,shop_name=shop_name,shop_url='https:' + shop_url if shop_url else response.url,product_url=data['product_url'],shop_id=data['shop_id'],push_kafka_status=0,license='https:' + license if license else '',)if len(data['product_list']) < 1:if data['product_url']:yield Request(url=data['product_url'],method='GET',meta={"data": data},callback=self.parse_product,dont_filter=True,priority=300)else:logger.warning(f"获取不到产品链接:{response.url}")items.pop('product_url')yield itemselse:product_list = []for item in data['product_list']:item['entityId'] = companyproduct_list.append(item)yield dict(source='jd.com',ocid='',entityId=company,product=product_list,)items['push_kafka_status'] = 1yield itemsasync def parse_product(self, response):"""解析产品页"""data = response.meta['data']shop_name = data['shop_name']company = data['company']categorys = data['categorys']product_list = self.get_product_items(response, shop_name, company, categorys, data['product_url'])if product_list:yield dict(source='jd.com',ocid='',entityId=company,product=product_list,)logger.info(f"成功: {company} => {data['shop_id']}")yield dict(company=company,shop_id=data['shop_id'],push_kafka_status=1,)else:logger.error(f"{response.url} => {data['shop_id']}")def get_product_items(self, response, shop_name='', company='', categorys='', shop_url='') -> list:ul = response.xpath('//li[@class="jSubObject"] | //li[@class="jSubObject gl-item"] | //div[@class="jItem"]')product_list = []for li in ul[:10]:title = li.xpath('.//div[@class="jDesc"]/a/@title').get() or ''# price = li.xpath('.//span[@class="jdNum"]/text()').get()img = str(li.xpath('.//div[@class="jPic"]//img/@src').get() or '').replace('s350x350', '')if not title and not img:continueif img:img = re.sub(r"/n[23456789]/", "/n1/", img)img = 'https:' + imgitem_i = {}item_i["entityId"] = companyitem_i["productPic"] = img.replace('s350x350', '')item_i["productName"] = title  # 产品名称item_i["productCategory"] = ""  # 产品分类item_i["productKeyword"] = ""  # 产品关键词item_i["productPrice"] = ""  # 产品价格item_i["mainProducts"] = categorys  # 主营产品item_i["listingPlatform"] = "京东"item_i["productShopName"] = shop_name  # 产品所属店铺名item_i["dataLink"] = shop_url or response.url  # 店铺链接product_list.append(item_i)return product_list@staticmethoddef get_category(response) -> str:categorys = response.xpath('//ul[@class="menu-list"]/li[@class="menu"]/a/text() | //div[@class="abs"]//div[@class="ins abs hdur_2"]/a/text()').getall()category = []for i in categorys:if '首页' in i or '全部' in i or '所有' in i or '问题' in i or '指导' in i or '售后' in i or '撰文' in i:continuecategory.append(i)return ','.join(category)if __name__ == '__main__':JingDongSpider.start()

最后的数据

在这里插入图片描述

本内容仅限用于学习参考,不得用于商业目的。如有版权问题,请联系我们删除,谢谢!
欢迎一起学习讨论Q540513871

http://www.dinnco.com/news/47345.html

相关文章:

  • 外贸网站该怎么做杭州网站建设 seo
  • 个人网站也需要备案吗引擎优化seo是什么
  • 网站设置默认首页搜索seo神器
  • 酒类网站建设方案案互联网营销主要学什么
  • 毕业设计代做网站jsp竞价排名名词解释
  • 主题资源网站制作平台app开发网站
  • openshift 做网站郑州seo课程
  • 如何在自己电脑上做网站服务器百度推广是干什么的
  • 融媒体中心建设与政府网站seo排名优化的网站
  • 如何建立网站和网页网站建站哪家公司好
  • 阿里国际网站首页可以做全屏不顶尖文案
  • 怎么做网站写书千瓜数据
  • 学生做兼职去哪个网站网络营销推广策划的步骤是什么
  • 郑州网站建设 58深圳外包seo
  • 域名价格查询seo泛目录培训
  • 哈尔滨公司网站脱发严重是什么原因引起的
  • 合肥建筑材料市场信息价官网seox
  • 域名网站建设教程网络营销咨询公司
  • 专业图库网站 西安环球军事网最新军事新闻最新消息
  • 中国万网域名注册价格北京seoqq群
  • php做大型网站百度公司的发展历程
  • 怎么做免费网站教程图片外链生成工具
  • 大连网站开发工资在线咨询
  • 020网站建设和维护费用河南推广网站
  • 泉州网站制作专业友情链接有哪些
  • 网站维修合同华为手机业务最新消息
  • 宁波品牌网站设计价格引擎优化seo怎么做
  • 外加工接单平台济南优化网页
  • 关于做美食的网站网络营销的新特点
  • 小宽带怎样做视频网站郑州seo外包平台