目录
亚马逊中国站获取全部商品分类
亚马逊中国站获取商品列表
亚马逊中国站通过ASIN获取商品信息
亚马逊中国站获取商品库存信息
亚马逊国际站获取全部商品分类
亚马逊国际站获取商品列表
亚马逊国际站处理图形验证码
亚马逊国际站通过ASIN获取商品信息
亚马逊国际站获取商品库存信息
所提供的代码均为可运行代码,但亚马逊的响应数据随时可能变化,从而造成解析异常。如果使用期间遇到问题,欢迎随时沟通,可扫描下方二维码关注公众号并留言。
出现验证码情形
一般来说,如果相同ip请求过于频繁,就会出现图形验证码,如果使用不同的user-agent来请求,也可以降低出现验证码的频率,但是不能完全避免。
经测试可行的处理方式有两种。
方式一:使用网络代理,比如使用10分钟内有效的代理,每10分钟更换一次,基本上能解决验证码的问题。如果仍然出现验证码,就改为每5分钟更换一次。该方式较为简单,不再提供代码。
方式二:识别出图形验证码并携带cookie重新请求。需要注意的是,验证码识别通过后跳转到的页面不一定是出现验证码之前原本要访问的页面,所以最好携带cookie对原地址重新请求一次。
图形验证码
市面上有多种图形验证码识别软件,之前使用过一款直接启动exe就能识别的软件,识别亚马逊的验证码一点问题也没有。不过由于它只能在Windows系统上运行,所以暂时不考虑了。
本篇使用的是图鉴(http://ttshitu.com/),识别效果不做保证,主要是便宜。
测试说明
测试请求商品列表信息的前面100页,一般来说,一个从来没有请求过的ip连续调用五六十次也会出现验证码。
Java代码
代码中验证码识别网站的账号和密码已替换为占位符,使用前需自行替换为自己的账号和密码。
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;import javax.imageio.stream.FileImageOutputStream;import org.apache.http.HttpResponse;
import org.apache.http.client.CookieStore;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;import com.alibaba.fastjson.JSONObject;public class AmazonTest7 {public static void main(String[] args) throws Exception {CookieStore store = new BasicCookieStore();CloseableHttpClient httpclient = HttpClients.custom().setDefaultCookieStore(store).build();String url = "https://www.amazon.com/-/zh/s?bbn=16225009011&rh=i%3Aspecialty-aps%2Cn%3A%2116225009011%2Cn%3A281407&ref_=nav_em__nav_desktop_sa_intl_accessories_and_supplies_0_2_5_2";for (int i = 1; i <= 100; i++) {System.out.println(url);url = printInfo(url, httpclient);}}static int i = 1;// 返回的是下一页的urlstatic String printInfo(String url, CloseableHttpClient httpclient) throws Exception {HttpGet get = new HttpGet(url);get.addHeader("accept-language", "zh-CN,zh;q=0.9,en;q=0.8");get.addHeader("user-agent","Mozilla/5.0(Macintosh;IntelMacOSX10_13_4)AppleWebKit/537.36(KHTML,likeGecko)Chrome/81.0.4044.138Safari/537.36");CloseableHttpResponse rese = httpclient.execute(get);String redsa = EntityUtils.toString(rese.getEntity());Document doc = Jsoup.parse(redsa);if (doc.toString().contains("Type the characters you see in this image:")) { // 有验证码while(true) {String code = checkCode(doc);System.out.println(code);Element eleForm = doc.getElementsByTag("form").first();String amzn = eleForm.getElementsByTag("input").first().val();String amznr = eleForm.getElementsByTag("input").get(1).val();String url2 = "https://www.amazon.com" + eleForm.attr("action") + "?amzn=" + amzn + "&amzn-r=" + amznr+ "&field-keywords=" + code;get = new HttpGet(url2);rese = httpclient.execute(get);redsa = EntityUtils.toString(rese.getEntity());doc = Jsoup.parse(redsa);if(!redsa.contains("Type the characters you see in this image:")) {break;}}get = new HttpGet(url);get.addHeader("accept-language", "zh-CN,zh;q=0.9,en;q=0.8");get.addHeader("user-agent","Mozilla/5.0(Macintosh;IntelMacOSX10_13_4)AppleWebKit/537.36(KHTML,likeGecko)Chrome/81.0.4044.138Safari/537.36");rese = httpclient.execute(get);redsa = 
EntityUtils.toString(rese.getEntity());doc = Jsoup.parse(redsa);}Elements goodsEles = doc.getElementsByClass("sg-col-4-of-12 s-result-item s-asin sg-col-4-of-16 sg-col s-widget-spacing-small sg-col-4-of-20");for (Element goodsEle : goodsEles) {System.out.println("商品" + i++);String detailUrl = "https://www.amazon.cn" + goodsEle.getElementsByTag("a").first().attr("href");System.out.println("商品详情:" + detailUrl);String asin = goodsEle.attr("data-asin");System.out.println("ASIN:" + asin);String uuid = goodsEle.attr("data-uuid");System.out.println("UUID:" + uuid);String img = goodsEle.getElementsByTag("img").first().attr("src");System.out.println("封面图片:" + img);String name = goodsEle.getElementsByTag("h2").first().text();System.out.println("名称:" + name);Element starEle = goodsEle.getElementsByClass("a-icon-alt").first();if (starEle != null) {String star = starEle.text();System.out.println("评分:" + star);String count = goodsEle.getElementsByClass("a-section a-spacing-none a-spacing-top-micro").first().getElementsByClass("a-size-base").first().text().replaceAll(",", "");System.out.println("评价人数:" + count);} else {System.out.println("暂无评分");System.out.println("评价人数:0");}Element priceEle = goodsEle.getElementsByClass("a-offscreen").first();if (priceEle != null) {String price = priceEle.text().replaceAll(",", "");System.out.println("价格:" + price);} else {System.out.println("价格:列表未显示价格,可能无货");}System.out.println("\n===================================\n");}FileWriter fw = new FileWriter("/Users/admin/Desktop/ac.html", false);BufferedWriter bw = new BufferedWriter(fw);bw.newLine();bw.write(doc.toString());bw.close();fw.close();String nextUrl = "https://www.amazon.com"+ doc.getElementsByClass("a-last").first().getElementsByTag("a").first().attr("href");return nextUrl;}private static String checkCode(Document doc) throws Exception {String img = doc.getElementsByClass("a-row a-text-center").first().getElementsByTag("img").first().attr("src");System.out.println(img);HttpGet get = new 
HttpGet(img);CloseableHttpClient httpclient = HttpClients.custom().build();HttpResponse response = httpclient.execute(get);byte[] data1 = EntityUtils.toByteArray(response.getEntity());FileImageOutputStream imageOutput = new FileImageOutputStream(new File("amazonCode.jpg"));imageOutput.write(data1, 0, data1.length);imageOutput.close();return readCode();}private static String readCode() throws Exception {String imgCode = "";String username = "【替换为用户名】";String password = "【替换为密码】";InputStream inputStream = null;File needRecoImage = new File("amazonCode.jpg");inputStream = new FileInputStream(needRecoImage);Map<String, String> data = new HashMap<>();data.put("username", username);data.put("password", password);data.put("typeid", "1002");String resultString = Jsoup.connect("http://api.ttshitu.com/create.json").data(data).data("image", "amazonCode.jpg", inputStream).ignoreContentType(true).post().text();Map<String, Object> map = JSONObject.parseObject(resultString);if ((Boolean) map.get("success")) {Map<String, Object> map1 = JSONObject.parseObject(String.valueOf(map.get("data")));imgCode = (String) map1.get("result");}return imgCode;}}