当前位置: 首页 > news >正文

河北网站开发互联网推广引流是做什么的

河北网站开发,互联网推广引流是做什么的,江西求做网站,天津狐臭在哪里做津门网站I之前用python写过爬虫,这次想试试nodeJS爬虫爬取贴吧图片,话不多说代码如下,爬取制定吧的前十页所有帖子里的图片 爬取贴吧图片脚本 你得提前创建一个images文件夹 const axios require("axios"); const cheerio require("…

之前用Python写过爬虫,这次想试试用Node.js写爬虫来爬取贴吧图片。话不多说,代码如下:爬取指定贴吧前十页所有帖子里的图片。

 爬取贴吧图片脚本

你得提前创建一个images文件夹

const axios = require("axios");
const cheerio = require("cheerio");
const sanitize = require("sanitize-filename");
const fs = require("fs");
const path = require("path");// 定义要爬取的贴吧URL
// List-page URL of the board to crawl.
const baseUrl = "https://tieba.baidu.com/f?kw=%CB%EF%D0%A6%B4%A8&fr=ala0&tpl=5&dyTabStr=MCwxLDMsMiw2LDQsNSw4LDcsOQ%3D%3D";

/**
 * Streams one image into ./images and resolves only once the file has been
 * completely written.
 *
 * @param {string} imgUrl - Absolute URL of the image.
 * @param {string} referer - Thread URL, sent as the Referer header.
 * @returns {Promise<void>} Rejects on request, read-stream or write-stream errors.
 */
async function saveImage(imgUrl, referer) {
  const response = await axios({
    method: "get",
    url: imgUrl,
    responseType: "stream",
    headers: {
      Referer: referer,
    },
  });
  const filename = sanitize(path.basename(imgUrl));
  const filePath = path.resolve(__dirname, `./images/${filename}.jpg`);
  await new Promise((resolve, reject) => {
    const output = fs.createWriteStream(filePath);
    // Stream failures surface as "error" events, not exceptions; route them
    // into the promise so the caller's try/catch can handle them.
    const fail = (error) => {
      response.data.destroy();
      output.destroy();
      reject(error);
    };
    output.on("finish", resolve);
    output.on("error", fail);
    response.data.on("error", fail);
    response.data.pipe(output);
  });
}

/**
 * Fetches one thread page and downloads every `img.BDE_Image` it contains.
 * Failures are logged and never thrown.
 *
 * @param {string} threadUrl - Absolute URL of the thread.
 * @returns {Promise<void>} Resolves when every image has finished or failed.
 */
async function downloadThreadImages(threadUrl) {
  try {
    const response = await axios.get(threadUrl);
    if (response.status !== 200) {
      console.log("请求失败");
      return;
    }
    const $ = cheerio.load(response.data);
    const imgUrls = [];
    $("img.BDE_Image").each((index, element) => {
      const src = $(element).attr("src");
      if (src) {
        imgUrls.push(src);
      }
    });
    await Promise.all(
      imgUrls.map(async (imgUrl, index) => {
        try {
          await saveImage(imgUrl, threadUrl);
          console.log(`第 ${index + 1} 张图片下载完成`);
        } catch (error) {
          console.log(`第 ${index + 1} 张图片下载失败`, error);
        }
      }),
    );
  } catch (error) {
    console.log("请求出错", error);
  }
}

/**
 * Crawls one list page of the board and downloads the images of every thread
 * linked from it. The returned promise settles only after all of that work is
 * done, so the caller's `await` really processes pages one at a time.
 *
 * @param {number} pageNum - Zero-based page index; the offset sent is pageNum * 50.
 * @returns {Promise<void>} Never rejects; failures are logged.
 */
async function getTitlesByPage(pageNum) {
  // The offset must travel in its own `pn` query parameter. Appending the
  // bare number to baseUrl only corrupted the last parameter's value.
  const url = `${baseUrl}&pn=${pageNum * 50}`;
  try {
    const response = await axios.get(url);
    if (response.status !== 200) {
      console.log(`请求第 ${pageNum + 1} 页失败`);
      return;
    }
    const $ = cheerio.load(response.data);
    const threadUrls = [];
    $(".threadlist_title a.j_th_tit").each((index, element) => {
      const href = $(element).attr("href");
      if (href) {
        threadUrls.push(`https://jump2.bdimg.com${href}`);
      }
    });
    await Promise.all(threadUrls.map((threadUrl) => downloadThreadImages(threadUrl)));
  } catch (error) {
    console.log(`请求第 ${pageNum + 1} 页出错`, error);
  }
}

/**
 * Entry point: makes sure ./images exists, then crawls the first ten list
 * pages sequentially.
 *
 * @returns {Promise<void>}
 */
async function getTitles() {
  fs.mkdirSync(path.resolve(__dirname, "./images"), { recursive: true });
  for (let i = 0; i < 10; i++) {
    await getTitlesByPage(i);
  }
}

getTitles().catch((error) => console.log("爬取出错", error));

这里有个弊端:IP很快就会被封掉。解决办法是爬取免费代理IP网站上的IP,在本地建立一个代理IP池(txt文件)。

找了一个勉强可用的免费代理IP网站:「免费代理IP_免费HTTP代理IP_SOCKS5代理服务器_优质IP代理_89免费代理IP」(https://www.89ip.cn)

里面的有效IP很少,那么得自己去大量爬取筛选可用IP

 这个是

爬取建立免费代理IP池的脚本

你得提前创建一个proxy.txt文件

const fs = require('fs');
const axios = require('axios');
const cheerio = require('cheerio');

// Browser-like User-Agent sent with every request to the proxy-list site.
const headers = {
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36',
};

/**
 * Scrapes pages 1-10 of www.89ip.cn and appends every new "ip:port" entry to
 * `filePath`, one per line.
 *
 * Entries already in the file, or already seen earlier in this run, are
 * skipped, so repeated calls do not pile up duplicates. Rows with an empty ip
 * or port cell are ignored instead of being written as a bare ":".
 *
 * @param {string} filePath - Text file that accumulates the proxy pool.
 * @returns {Promise<void>} Per-page failures are logged and do not stop the run.
 */
async function get89IP(filePath) {
  const known = new Set(
    fs.existsSync(filePath)
      ? fs
          .readFileSync(filePath, 'utf-8')
          .split(/\r?\n/)
          .map((line) => line.trim())
          .filter(Boolean)
      : [],
  );

  for (let i = 1; i <= 10; i++) { // collect the first 10 pages
    const url = `https://www.89ip.cn/index_${i}.html`;
    try {
      const response = await axios.get(url, { headers });
      const $ = cheerio.load(response.data);
      const fresh = [];
      $('table tbody tr').each((index, element) => {
        const ip = $(element).find('td:nth-child(1)').text().trim();
        const port = $(element).find('td:nth-child(2)').text().trim();
        if (!ip || !port) {
          return; // incomplete row; a bare `return` continues the iteration
        }
        const proxyIP = `${ip}:${port}`;
        if (known.has(proxyIP)) {
          return;
        }
        known.add(proxyIP);
        fresh.push(proxyIP);
      });
      if (fresh.length > 0) {
        fs.appendFileSync(filePath, fresh.join('\n') + '\n');
      }
      console.log(`第${i}页采集完成`);
    } catch (error) {
      console.error('出错了:', error);
    }
    // Pause one second between pages.
    await new Promise((resolve) => setTimeout(resolve, 1000));
  }
}

/**
 * Runs the collector forever, starting a new round 60 seconds after the
 * previous one ends. Stop it manually once proxy.txt is large enough.
 *
 * @returns {Promise<never>}
 */
async function main() {
  const filePath = './proxy.txt';
  while (true) {
    try {
      await get89IP(filePath);
      console.log('采集完成');
    } catch (error) {
      console.error('出错了:', error);
    }
    await new Promise((resolve) => setTimeout(resolve, 60000));
  }
}

main();

 采集完成后的筛选IP代码

 一个一个筛选太慢,这里使用到了Promise.all

你得提前创建一个kyProxy.txt文件

const fs = require('fs');
const axios = require('axios');

// Candidate proxies from proxy.txt, one "host:port" per line. Lines are
// trimmed (tolerates CRLF), malformed lines are dropped, and duplicates are
// removed so that no proxy is tested or written twice.
const proxyList = [
  ...new Set(
    fs
      .readFileSync('proxy.txt', 'utf-8')
      .split(/\r?\n/)
      .map((line) => line.trim())
      .filter((line) => /^[^:\s]+:\d+$/.test(line)),
  ),
];

/**
 * Checks whether https://tieba.baidu.com/ can be fetched through one proxy
 * within 5 seconds.
 *
 * @param {string} ip - Proxy in "host:port" form.
 * @returns {Promise<boolean>} true on status 200 or 302, false on any other
 *   status or on a request error (the error is logged).
 */
async function testProxy(ip) {
  const [host, port] = ip.split(':');
  try {
    const response = await axios.get('https://tieba.baidu.com/', {
      proxy: { host, port },
      timeout: 5000,
    });
    if (response.status === 200 || response.status === 302) {
      return true;
    }
  } catch (error) {
    console.error(error);
  }
  return false;
}

/**
 * Tests every candidate proxy concurrently, logs the verdict for each one,
 * and writes the usable ones to kyProxy.txt, one per line.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const results = await Promise.all(proxyList.map((proxy) => testProxy(proxy)));
  const validProxies = [];
  proxyList.forEach((proxy, index) => {
    if (results[index]) {
      validProxies.push(proxy);
      console.log(`代理IP ${proxy} 可用`);
    } else {
      console.log(`代理IP ${proxy} 不可用`);
    }
  });
  fs.writeFileSync('kyProxy.txt', validProxies.join('\n'));
  console.log('可用代理IP已写入 kyProxy.txt');
}

main().catch((error) => console.error(error));

 到这一步,kyProxy.txt里面的IP基本是稳定可用的了,最后一步就是使用kyProxy.txt里的代理IP去爬取图片

 通过代理IP爬取贴吧图片

const axios = require("axios");
const cheerio = require("cheerio");
const sanitize = require("sanitize-filename");
const fs = require("fs");
const path = require("path");// 定义要爬取的贴吧URL
const baseUrl ="https://tieba.baidu.com/f?kw=%CB%EF%D0%A6%B4%A8&fr=ala0&tpl=5&dyTabStr=MCwxLDMsMiw2LDQsNSw4LDcsOQ%3D%3D";

/**
 * Loads the proxy pool from kyProxy.txt, which sits next to this script.
 *
 * Lines are split on LF or CRLF, trimmed, and blank lines are dropped, so an
 * empty file yields an empty list rather than a single empty entry.
 *
 * @returns {Promise<string[]>} Proxies in "host:port" form, in file order.
 *   Rejects if the file cannot be read.
 */
async function getProxyList() {
  const fileContent = await fs.promises.readFile(
    path.resolve(__dirname, "./kyProxy.txt"),
    "utf8",
  );
  return fileContent
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter(Boolean);
}
// Send an HTTP request to fetch the page content
/**
 * Crawls one list page of the board through the first proxy that works, then
 * downloads the images of every thread linked from that page via the same
 * proxy.
 *
 * Threads are processed concurrently; images within one thread are downloaded
 * one after another. The returned promise settles only after all downloads
 * have finished, so the caller's `await` really serialises the pages.
 *
 * @param {number} pageNum - Zero-based page index; the offset sent is pageNum * 50.
 * @param {string[]} proxyList - Proxies in "host:port" form, tried in order.
 * @returns {Promise<void>} Never rejects; failures are logged.
 */
async function getTitlesByPage(pageNum, proxyList) {
  // The offset must travel in its own `pn` query parameter. Appending the
  // bare number to baseUrl only corrupted the last parameter's value.
  const url = `${baseUrl}&pn=${pageNum * 50}`;
  try {
    let page = null;
    for (const proxy of proxyList) {
      console.log(`使用代理IP:${proxy}`);
      try {
        const [host, port] = proxy.split(":");
        const response = await axios.get(url, { proxy: { host, port } });
        if (response.status !== 200) {
          console.log(`代理IP ${proxy} 请求失败`);
          continue;
        }
        const $ = cheerio.load(response.data);
        const threadUrls = [];
        $(".threadlist_title a.j_th_tit").each((index, element) => {
          const href = $(element).attr("href");
          if (href) {
            threadUrls.push(`https://jump2.bdimg.com${href}`);
          }
        });
        page = { proxy, threadUrls };
        break;
      } catch (error) {
        console.log(`代理IP ${proxy} 请求出错`, error);
      }
    }

    if (page === null) {
      console.log(`请求第 ${pageNum + 1} 页失败,跳过`);
      return;
    }

    // Thread work is awaited here; an async callback handed to `.each` would
    // be neither awaited nor covered by this try/catch.
    await Promise.all(
      page.threadUrls.map(async (threadUrl) => {
        const imgUrls = await getImgUrls(threadUrl, page.proxy);
        for (let j = 0; j < imgUrls.length; j++) {
          await downloadImg(imgUrls[j], j, threadUrl, page.proxy);
        }
      }),
    );
  } catch (error) {
    console.log(`请求第 ${pageNum + 1} 页出错`, error);
  }
}
// Collect every image link in a thread
/**
 * Fetches a thread page through the given proxy and returns the `src`
 * attribute of every `img.BDE_Image` element on it.
 *
 * @param {string} url - Thread URL; also sent as the Referer header.
 * @param {string} proxy - Proxy in "host:port" form.
 * @returns {Promise<Array>} The collected `src` values in document order, or
 *   an empty array when the status is not 200 or the request throws (both
 *   cases are logged).
 */
async function getImgUrls(url, proxy) {
  try {
    const [host, port] = proxy.split(":");
    const requestOptions = {
      proxy: { host, port },
      headers: { Referer: url },
    };
    const page = await axios.get(url, requestOptions);

    if (page.status !== 200) {
      console.log(`请求 ${url} 失败`);
      return [];
    }

    const $ = cheerio.load(page.data);
    const sources = [];
    $("img.BDE_Image").each((_, node) => {
      sources.push($(node).attr("src"));
    });
    return sources;
  } catch (error) {
    console.log(`请求 ${url} 出错`, error);
    return [];
  }
}
// Download a single image
/**
 * Downloads one image through the given proxy into ./images and resolves only
 * after the file has been completely written.
 *
 * Request errors, non-200 responses and stream errors (for example a missing
 * ./images directory) are logged; the function never throws.
 *
 * @param {string} imgUrl - Absolute URL of the image.
 * @param {number} index - Zero-based position of the image, used in log lines.
 * @param {string} url - Thread URL, sent as the Referer header.
 * @param {string} proxy - Proxy in "host:port" form.
 * @returns {Promise<void>}
 */
async function downloadImg(imgUrl, index, url, proxy) {
  try {
    const [host, port] = proxy.split(":");
    const response = await axios({
      method: "get",
      url: imgUrl,
      responseType: "stream",
      proxy: { host, port },
      headers: {
        Referer: url,
      },
    });
    if (response.status !== 200) {
      console.log(`第 ${index + 1} 张图片下载失败`);
      return;
    }
    const filename = sanitize(path.basename(imgUrl));
    const filePath = path.resolve(__dirname, `./images/${filename}.jpg`);
    await new Promise((resolve, reject) => {
      const output = fs.createWriteStream(filePath);
      // Stream failures surface as "error" events, not exceptions; without
      // these listeners they bypass the try/catch and crash the process.
      const fail = (error) => {
        response.data.destroy();
        output.destroy();
        reject(error);
      };
      output.on("finish", resolve);
      output.on("error", fail);
      response.data.on("error", fail);
      response.data.pipe(output);
    });
    // Logged only after "finish", i.e. when the file is really on disk.
    console.log(`第 ${index + 1} 张图片下载完成`);
  } catch (error) {
    console.log(`第 ${index + 1} 张图片下载出错`, error);
  }
}

/**
 * Entry point: loads the proxy pool, makes sure ./images exists, then crawls
 * the first ten list pages one after another.
 *
 * @returns {Promise<void>} Rejects if the proxy file cannot be read or the
 *   images directory cannot be created.
 */
async function getTitles() {
  const proxyList = await getProxyList();
  fs.mkdirSync(path.resolve(__dirname, "./images"), { recursive: true });
  for (let i = 0; i < 10; i++) {
    await getTitlesByPage(i, proxyList);
  }
}

getTitles().catch((error) => console.log("爬取出错", error));

爬取效果

效果还可以


文章转载自:
http://dinncoshrievalty.bpmz.cn
http://dinncofash.bpmz.cn
http://dinncoperoneal.bpmz.cn
http://dinncoabask.bpmz.cn
http://dinncobern.bpmz.cn
http://dinncooctopamine.bpmz.cn
http://dinncoforbidding.bpmz.cn
http://dinncospindleshanks.bpmz.cn
http://dinncosneak.bpmz.cn
http://dinncocongratters.bpmz.cn
http://dinncoelectric.bpmz.cn
http://dinncofavorableness.bpmz.cn
http://dinncofiddle.bpmz.cn
http://dinncomeditation.bpmz.cn
http://dinnconephrectomize.bpmz.cn
http://dinncocolorimetric.bpmz.cn
http://dinncomillyum.bpmz.cn
http://dinncomost.bpmz.cn
http://dinncopollyanna.bpmz.cn
http://dinncoemulsin.bpmz.cn
http://dinncomacroinstruction.bpmz.cn
http://dinncoeyeless.bpmz.cn
http://dinncodiatomaceous.bpmz.cn
http://dinncosibilance.bpmz.cn
http://dinncoplenty.bpmz.cn
http://dinncoplagiary.bpmz.cn
http://dinncotoes.bpmz.cn
http://dinncorabbath.bpmz.cn
http://dinncopenninite.bpmz.cn
http://dinncochurning.bpmz.cn
http://dinncoromp.bpmz.cn
http://dinncofilmdom.bpmz.cn
http://dinncouddered.bpmz.cn
http://dinncoprizeless.bpmz.cn
http://dinnconubk.bpmz.cn
http://dinncoeurasiatic.bpmz.cn
http://dinncobuildable.bpmz.cn
http://dinncofusion.bpmz.cn
http://dinncoadperson.bpmz.cn
http://dinncofrivolous.bpmz.cn
http://dinncotonqua.bpmz.cn
http://dinncocodetta.bpmz.cn
http://dinncometallogenetic.bpmz.cn
http://dinncoinversive.bpmz.cn
http://dinncovenerably.bpmz.cn
http://dinncoannealing.bpmz.cn
http://dinncointerstice.bpmz.cn
http://dinncolyceum.bpmz.cn
http://dinncodirefully.bpmz.cn
http://dinncopyometra.bpmz.cn
http://dinncotemple.bpmz.cn
http://dinncocherrapunji.bpmz.cn
http://dinncoanaphylactin.bpmz.cn
http://dinncoladify.bpmz.cn
http://dinncocomtesse.bpmz.cn
http://dinncoiscariot.bpmz.cn
http://dinncoguanay.bpmz.cn
http://dinncoaquarii.bpmz.cn
http://dinncowinkle.bpmz.cn
http://dinncofabricate.bpmz.cn
http://dinncosemiaquatic.bpmz.cn
http://dinncoprovenly.bpmz.cn
http://dinncobodysurf.bpmz.cn
http://dinncorenominate.bpmz.cn
http://dinncoglumpy.bpmz.cn
http://dinncointercontinental.bpmz.cn
http://dinncowailful.bpmz.cn
http://dinncosuzerainty.bpmz.cn
http://dinncopaintbox.bpmz.cn
http://dinncodarwinist.bpmz.cn
http://dinncoautoexec.bpmz.cn
http://dinncostrawworm.bpmz.cn
http://dinncooverdose.bpmz.cn
http://dinncomonogenism.bpmz.cn
http://dinncoemploye.bpmz.cn
http://dinncohardhat.bpmz.cn
http://dinncocardoon.bpmz.cn
http://dinncocounterfactual.bpmz.cn
http://dinncoimputable.bpmz.cn
http://dinncoichnographically.bpmz.cn
http://dinncorebbitzin.bpmz.cn
http://dinncooverplaid.bpmz.cn
http://dinncominimill.bpmz.cn
http://dinncozoantharian.bpmz.cn
http://dinncomephitis.bpmz.cn
http://dinncofoliation.bpmz.cn
http://dinncosubtropics.bpmz.cn
http://dinncoepigenic.bpmz.cn
http://dinncosolutionist.bpmz.cn
http://dinncotennessean.bpmz.cn
http://dinncoscirrhus.bpmz.cn
http://dinncolarder.bpmz.cn
http://dinnconameboard.bpmz.cn
http://dinncoidiosyncratic.bpmz.cn
http://dinncocircumnutation.bpmz.cn
http://dinncofanconi.bpmz.cn
http://dinncocryptogram.bpmz.cn
http://dinncofundus.bpmz.cn
http://dinncospecky.bpmz.cn
http://dinncogrogshop.bpmz.cn
http://www.dinnco.com/news/131954.html

相关文章:

  • 建立手机个人网站常州seo关键词排名
  • 做网站推广要注意什么营销型网站建设要点
  • 深圳创业孵化基地入驻条件搜索引擎优化seo价位
  • 为什么要建设政府网站一级域名二级域名三级域名的区别
  • 网站备案号注销查询免费独立站自建站网站
  • 如何介绍网站模板下载地址seo如何建立优化网站
  • dreamweaver的简介网站seo搜索引擎优化教程
  • 艾迪网络专业的网站建设公司品牌策划方案模板
  • 如何给一个网站做定时的更新企业网站建设方案模板
  • 哈尔滨建站模板系统seo文章范文
  • 一键生成海报成都官网seo服务
  • 大浪做网站公司域名查询网
  • 网站汉英结合的怎么做百度关键词工具入口
  • 营口旅游网站建设seo外包如何
  • 印刷公司网站模板优化大师客服
  • 做网站服务器用谁的seo做的比较好的公司
  • 网站怎么更换域名seo名词解释
  • 做的好的茶叶网站好电商网站制作
  • 带地板翻转的网站怎么做电商运营seo
  • wordpress 安装 空白深圳seo网络优化公司
  • 网站主页设计收费适合seo软件
  • win7电脑做网站主机企业管理软件
  • 哪些网站可以在线做动图seo网站关键词排名快速
  • 网站源码安装步骤网站如何做seo排名
  • 河池城乡住房和建设局网站seo云优化软件破解版
  • 手机网站怎么建设关键词推广操作
  • 在国外做网站赌博犯法吗网站推广的一般流程是
  • 网站内页百度提交口网站定制开发
  • 西安网站运营招聘淘宝指数查询官网手机版
  • 建产品网站怎么做武汉seo百度