1、贴出主要代码。这个不是 Python,Python 只涉及服务端对信息提取结果的接收。主体是 Java + Android + JS。由于淘宝各模块都是二级子域名,而 ajax 不能跨域,无法只在一个页面完成所有请求,需要依次加载不同的页面。以下是主要部分。JS 内容由服务端分发。
这样做的好处是:即使不使用微服务,单台机器也能满足1000个用户在同一分钟内提交账号密码请求登录,既简化了后台编写的复杂度,也减小了服务器压力。密码、验证码的校验也更及时。
2、不是爬自己的信息,是获取别人 任意账号 + 密码的淘宝个人信息,如果是为了拿到自己的信息,搞这么多七七八八的那是闲的蛋疼。具体账号 密码是哪来的,置顶第一篇有介绍。
package com.touna.crawlmodule;import android.graphics.Bitmap; import android.net.http.SslError; import android.support.v7.app.AppCompatActivity; import android.os.Bundle; import android.util.Log; import android.view.View; import android.webkit.CookieManager; import android.webkit.JavascriptInterface; import android.webkit.SslErrorHandler; import android.webkit.ValueCallback; import android.webkit.WebChromeClient; import android.webkit.WebSettings; import android.webkit.WebView; import android.webkit.WebViewClient; import org.json.JSONObject;import com.xx.httprequest.CrawlResultSender; import com.xx.view.LogUtil; import com.xx.view.ViewUtil; import com.xx.view.WebViewTimer;public class TaobaoActivity extends AppCompatActivity {private static final String TAG = "MainActivity";private static final String LOGINPAGEURL = "https://login.m.taobao.com/login.htm";//移动端登陆页面private static final String MOBILEINDEXPAGEURL = "http://h5.m.taobao.com/mlapp/mytaobao.html";//移动端淘宝个人用户首页private static final String PCINDEXPAGEURL = "https://www.taobao.com/";private static final String BINDPAGEURL = "http://member1.taobao.com/member/fresh/account_management.htm";private static final String COLLECTIONURL = "https://shoucang.taobao.com/nodejs/item_collect_chunk.htm";//收藏页面urlprivate static final String ADDRESSURL = "https://member1.taobao.com/member/fresh/deliver_address.htm";//收货地址urlprivate static final String MYPATHURL = "https://lu.taobao.com/newMyPath.htm";//我的足迹urlprivate static final String BOUGHTSHOPSURL = "https://favorite.taobao.com/list_bought_shops_n.htm";//已经购买的店铺private static final String BOUGHTITEMSURL = "https://buyertrade.taobao.com/trade/itemlist/list_bought_items.htm";//已经购买的物品private static final String SHOPCARTURL = "https://cart.taobao.com/cart.htm";//购物车URLprivate static final String SAFESETTINGURL = "http://member1.taobao.com/member/fresh/certify_info.htm";//安全信息设置private static final String TRADEINFOURL = 
"http://member1.taobao.com/member/fresh/account_profile.htm";//交易信息urlprivate static final String PERSONALINFOURL = "https://i.taobao.com/user/baseInfoSet.htm";//个人资料urlprivate static final String POINTSURL = "https://pages.tmall.com/wow/jifen/act/point-details";//积分URLprivate static final String WEIBOURL = "http://member1.taobao.com/member/fresh/weibo_bind_management.htm";//绑定微博URLprivate static final String REFUSEURL = "https://refund2.tmall.com/dispute/buyerDisputeList.htm?type=1&disputeType=1";//退货管理URLprivate static final String HUABEIURL = "https://i.taobao.com/my_taobao.htm";//支付宝余额和花呗额度private JSONObject dataJson=new JSONObject();@Overrideprotected void onCreate(Bundle savedInstanceState) {super.onCreate(savedInstanceState);setContentView(R.layout.activity_taobo);startWebView();}private void startWebView() {WebView webView = findViewById(R.id.taobaoView);final WebSettings settings = webView.getSettings();settings.setUseWideViewPort(true);settings.setLayoutAlgorithm(WebSettings.LayoutAlgorithm.NARROW_COLUMNS);settings.setLoadWithOverviewMode(true);settings.setJavaScriptEnabled(true);webView.addJavascriptInterface(new JsInterface(), "JsInterface");settings.setJavaScriptEnabled(true);settings.setLoadWithOverviewMode(true);settings.setSupportZoom(true);settings.setDomStorageEnabled(true);settings.setCacheMode(WebSettings.LOAD_NO_CACHE);settings.setAllowFileAccess(true);settings.setUseWideViewPort(true);settings.setSupportMultipleWindows(true);settings.setLoadsImagesAutomatically(true);//settings.setBlockNetworkImage(false);settings.setDefaultTextEncodingName("GBK");webView.setVerticalScrollBarEnabled(true);webView.setHorizontalScrollBarEnabled(true);settings.setUserAgentString("Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36");webView.setWebChromeClient(new WebChromeClient());startWebViewClient(webView);webView.loadUrl(LOGINPAGEURL);}/*** @param view WebView对象* 初始化webviewClient*/private void 
startWebViewClient(WebView view) {view.setWebViewClient(new WebViewClient() {@Overridepublic void onReceivedSslError(WebView view, SslErrorHandler handler, SslError error) {handler.proceed();}@Overridepublic void onPageStarted(final WebView view, String url, Bitmap favicon) {Log.e(TAG, "onPageStarted: " + url);if (url.contains(LOGINPAGEURL)){view.setVisibility(View.GONE);}}/*** @param view 浏览器对象* @param url 浏览器地址*/@Overridepublic void onPageFinished(final WebView view, String url) {Log.e(TAG, "onPageFinished: " + url);if (url.contains(LOGINPAGEURL)) {ViewUtil.injectScriptFile(view, "loginPage/taobaoInit.js");view.loadUrl("javascript:initLoginPage()");new WebViewTimer(view, 300){@Overridepublic void operateView(){view.setVisibility(View.VISIBLE);}};}if (url.contains(MOBILEINDEXPAGEURL)) {//view.getSettings().setUserAgentString("Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36");ViewUtil.setNoImage(view); //关闭图片view.loadUrl(PCINDEXPAGEURL);}if (PCINDEXPAGEURL.equals(url)) {view.loadUrl(REFUSEURL);}if (url.contains(REFUSEURL)) {ViewUtil.injectScriptFile(view, "jquery.min.js"); //此处需要jquery!ViewUtil.injectScriptFromInternet(view, "taobao/refund.js");view.loadUrl("javascript:window.JsInterface.getReturnString(extractRefund());");view.loadUrl(POINTSURL);}if (url.contains(POINTSURL)) {Log.e(TAG, "onPageFinished: inject");ViewUtil.injectScriptFromInternet(view, "taobao/point.js");view.loadUrl("javascript:window.JsInterface.getReturnString(extractPoint());");view.loadUrl(PERSONALINFOURL);}if (url.contains(PERSONALINFOURL)) {ViewUtil.injectScriptFromInternet(view, "taobao/personalInformation.js");view.loadUrl("javascript:window.JsInterface.getReturnString(extractPersonalInformation());");view.loadUrl(COLLECTIONURL);}if (url.contains(COLLECTIONURL)) {ViewUtil.injectScriptFromInternet(view, 
"taobao/collect.js");view.loadUrl("javascript:window.JsInterface.getReturnString(extractCollect());");view.loadUrl(ADDRESSURL);}if (url.contains(ADDRESSURL)) {ViewUtil.injectScriptFromInternet(view, "taobao/delivery.js");view.loadUrl("javascript:window.JsInterface.getReturnString(extractDelivery());");view.loadUrl(MYPATHURL);}if (url.contains(MYPATHURL)) {ViewUtil.injectScriptFromInternet(view, "taobao/footprint.js");view.loadUrl("javascript:window.JsInterface.getReturnString(extractFootprint());");view.loadUrl(BOUGHTSHOPSURL);}if (url.contains(BOUGHTSHOPSURL)) {ViewUtil.injectScriptFromInternet(view, "taobao/havaboughtStore.js");view.loadUrl("javascript:window.JsInterface.getReturnString(extractHaveBoughtStore());");view.loadUrl(BOUGHTITEMSURL);}if (url.contains(BOUGHTITEMSURL)) {ViewUtil.injectScriptFromInternet(view, "taobao/havebought.js");view.loadUrl("javascript:window.JsInterface.getReturnString(extractHaveBought());");view.loadUrl(SHOPCARTURL);}if (url.contains(SHOPCARTURL)) {ViewUtil.injectScriptFromInternet(view, "taobao/shoppingCart.js");view.loadUrl("javascript:window.JsInterface.getReturnString(extractShoppingCart());");view.loadUrl(SAFESETTINGURL);}if (url.contains(SAFESETTINGURL)) {ViewUtil.injectScriptFromInternet(view, "taobao/safeSettings.js");view.loadUrl("javascript:window.JsInterface.getReturnString(extractSafeSettings());");view.loadUrl(TRADEINFOURL);}if (url.contains(TRADEINFOURL)) {ViewUtil.injectScriptFromInternet(view, "taobao/tradeInfo.js");view.loadUrl("javascript:window.JsInterface.getReturnString(extractTradeInfo());");view.loadUrl(WEIBOURL);}if (url.contains(WEIBOURL)) {ViewUtil.injectScriptFromInternet(view, "taobao/weibo.js");view.loadUrl("javascript:window.JsInterface.getReturnString(extractWeibo());");view.loadUrl(BINDPAGEURL);}if (url.contains(BINDPAGEURL)) {ViewUtil.injectScriptFromInternet(view, "taobao/alipayBinding.js");view.loadUrl("javascript:window.JsInterface.getReturnString(extractAlipay());");view.loadUrl(HUABEIURL);}if 
(url.contains(HUABEIURL)) {ViewUtil.injectScriptFromInternet(view, "taobao/huabei.js");view.loadUrl("javascript:clickHuabei1()");new WebViewTimer(view, 2000){@Overridepublic void operateView(){view.loadUrl("javascript:clickHuabei2()");}};new WebViewTimer(view, 4000){@Overridepublic void operateView(){view.evaluateJavascript("extractHuabei()", new ValueCallback<String>() {@Overridepublic void onReceiveValue(String s) {Log.e(TAG, "onReceiveValue: "+s );String jsonStr = ViewUtil.getStrLikeJson(s);ViewUtil.reconsituteJSon(jsonStr, dataJson);ViewUtil.showLargeLog(dataJson.toString());CrawlResultSender.sendToweb("taobao", dataJson.toString());}});}};}}});}class JsInterface {private static final String TAG = "JSInterface";@JavascriptInterfacepublic void getReturnString(String returnValue) throws Exception{Log.e(TAG,"当前项返回值是: " + returnValue);ViewUtil.reconsituteJSon(returnValue,dataJson);}} }
贴出其中一个 JS 示例:提取用户收藏的物品。这里不是直接翻页,而是使用 ajax 以提升效率,且 ajax 必须使用同步方式。由于此接口返回的是页面(HTML)而不是 JSON,可以用 CSS 选择器提取。
/**
 * Created by ㄟ(▔=▔)ㄏ on 2018/1/5.
 */
/*
 * https://shoucang.taobao.com/nodejs/item_collect_chunk.htm?ifAllTag=0&tab=0&tagId=&categoryCount=0&type=0&tagName=&categoryName=&needNav=false&startRow=0
 * Extracts the user's favourited (collected) items.
 */

/**
 * Minimal XMLHttpRequest wrapper.
 *
 * @param {Object}  [opt]               request options (normalised in place)
 * @param {string}  [opt.type='POST']   HTTP method; only POST and GET are sent
 * @param {string}  [opt.url='']        target URL
 * @param {boolean} [opt.async=false]   asynchronous flag passed to open()
 * @param {Object}  [opt.data=null]     key/value pairs sent as a form body (POST)
 *                                      or appended to the query string (GET)
 * @param {Function} [opt.success]      stored on opt with a no-op default; this
 *                                      function does not invoke it
 * @returns {XMLHttpRequest} the request object; for synchronous calls the
 *                           response is already available on it
 */
function myajax(opt) {
    opt = opt || {};
    // Default BEFORE upper-casing, otherwise a missing type throws a TypeError.
    opt.type = (opt.type || 'POST').toUpperCase();
    opt.url = opt.url || '';
    opt.async = opt.async || false;
    opt.data = opt.data || null;
    opt.success = opt.success || function () {};

    // typeof avoids a ReferenceError where XMLHttpRequest is not defined.
    var xmlHttp = typeof XMLHttpRequest !== 'undefined'
        ? new XMLHttpRequest()
        : new ActiveXObject('Microsoft.XMLHTTP');

    var params = [];
    for (var key in opt.data) {
        if (Object.prototype.hasOwnProperty.call(opt.data, key)) {
            params.push(encodeURIComponent(key) + '=' + encodeURIComponent(opt.data[key]));
        }
    }
    var query = params.join('&');

    if (opt.type === 'POST') {
        xmlHttp.open(opt.type, opt.url, opt.async);
        xmlHttp.setRequestHeader('Content-Type', 'application/x-www-form-urlencoded;charset=utf-8');
        xmlHttp.send(query);
    } else if (opt.type === 'GET') {
        var target = opt.url;
        if (query) {
            // Respect a query string that is already part of the URL.
            target += (target.indexOf('?') === -1 ? '?' : '&') + query;
        }
        xmlHttp.open(opt.type, target, opt.async);
        xmlHttp.send(null);
    }
    return xmlHttp;
}


/**
 * Fetches the favourites list page by page with synchronous GET requests and
 * extracts name, URL and price of every item from each response.
 *
 * Pages 0..3 are requested (30 items per page); paging stops early at the first
 * response that contains no "J_FavListItem" marker.
 *
 * @returns {string} JSON text of the form {"collectInfo":[{collectName, collectUrl, collectPrice}, ...]}
 */
function extractCollect() {
    var LAST_PAGE_INDEX = 3;
    var PAGE_SIZE = 30;
    var baseUrl = 'https://shoucang.taobao.com/nodejs/item_collect_chunk.htm?ifAllTag=0&tab=0&tagId=&categoryCount=0&type=0&tagName=&categoryName=&needNav=false&startRow=';

    var collectList = [];
    var parser = new DOMParser();

    for (var p = 0; p <= LAST_PAGE_INDEX; p++) {
        console.debug("当前是第 " + p + "页");
        var url = baseUrl + (p * PAGE_SIZE);
        var htmlStr = myajax({
            type: 'GET',
            url: url,
            async: false
        }).responseText;

        // An empty chunk means there is nothing further to page through.
        if (!htmlStr || htmlStr.indexOf("J_FavListItem") === -1) {
            break;
        }

        // Query the fetched chunk, not the live document; otherwise every page
        // would yield the same items that the current page already shows.
        var pageDoc = parser.parseFromString(htmlStr, 'text/html');
        var items = pageDoc.querySelectorAll('li.J_FavListItem');
        for (var i = 0; i < items.length; i++) {
            console.debug(i);
            var link = items[i].querySelector('a.img-item-title-link');
            if (!link) {
                continue; // malformed entry without a title link
            }
            var priceElement = items[i].querySelector('.g_price strong');
            // No price element means the item is no longer available.
            var collectPrice = priceElement ? priceElement.textContent : "宝贝已失效";
            var collectObj = {'collectName': link.title, 'collectUrl': link.href, 'collectPrice': collectPrice};
            console.info(collectObj);
            collectList.push(collectObj);
        }
        console.info(url);
    }

    return '{"collectInfo":' + JSON.stringify(collectList) + '}';
}

//extractCollect();
这就是唯一登录淘宝获取信息的方法,不管是什么语言java py,不管是用httpclient urlconnection还是urllib requests 想达到 本篇的目的,可能性为0。不服不信的可以用httpclient urllib试试,光是一个接口登录淘宝,网上就在悬赏5万人民币了,就不说提取信息了,单是把这个接口登录淘宝解决,相当于几个月的工资了。