如何避免在 Puppeteer 和 PhantomJS 中被检测为机器人?[英] How to avoid being detected as bot on Puppeteer and Phantomjs?

问题描述

Puppeteer 和 PhantomJS 很相似.我在两者上遇到的问题相同,代码也相似.

我想从一个网站上抓取一些信息,查看这些信息需要先通过身份验证.但我连主页都无法访问,因为我的访问被检测为"可疑活动",如截图所示: https://i.imgur.com/p69oijo.png

我发现,当我使用名为 cookie

我能做什么绕过这个?

// Simple PhantomJS example: load the home page and save a screenshot of it.
// PhantomJS runs ES5, so this snippet deliberately avoids const/let,
// arrow functions and template literals.
var page = require('webpage').create();
var url = 'https://www.expertflyer.com';

page.open(url, function (status) {
    // Always terminate PhantomJS: without an exit on the failure path the
    // process would stay alive forever when the page cannot be loaded.
    if (status !== 'success') {
        console.log('Failed to load ' + url + ' (status: ' + status + ')');
        phantom.exit(1);
        return;
    }

    page.render('home.png');
    phantom.exit();
});

推荐答案

如果将来有人遇到同样的问题:请使用 puppeteer-extra.

我已经在服务器上测试了代码.第二次运行时出现了 Google 验证码(CAPTCHA).您可以自己手动解决验证码并重新启动机器人,或者使用 CAPTCHA 解决服务.

我把代码运行了10次以上,没有被封禁IP.在后续的连续运行中,我没有再遇到CAPTCHA.

但你可以再次获得CAPTCHA!

// Dependencies (install command as given by the author):
//   sudo npm install puppeteer puppeteer-extra puppeteer-extra-plugin-stealth puppeteer-extra-plugin-adblocker readline
const readline = require('readline');
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker');

// First command-line argument; run() launches headless only when it is the string 'true'.
const headless_mode = process.argv[2];

// Register the plugins in the same order as before: stealth first, then the ad blocker.
puppeteer.use(StealthPlugin());
puppeteer.use(AdblockerPlugin({ blockTrackers: true }));


/**
 * Opens expertflyer.com in a Puppeteer-controlled browser, checks whether the
 * site served its normal page or a "suspicious activity" page, and — when the
 * normal page is available — clicks through two menu entries, saving a
 * screenshot of each (expertflyer1.png, expertflyer2.png).
 *
 * The browser is always closed, even when a step throws.
 *
 * @returns {Promise<void>}
 * @throws Rethrows the last navigation error when the home page cannot be loaded.
 */
async function run () {
  const browser = await puppeteer.launch({
    headless: headless_mode === 'true',
    ignoreHTTPSErrors: true,
    slowMo: 0,
    args: [
      '--window-size=1400,900',
      '--remote-debugging-port=9222',
      // SECURITY: binding the DevTools port to 0.0.0.0 lets anyone who can reach
      // this machine drive the browser. It is kept because the CAPTCHA is solved
      // remotely through chrome://inspect; firewall the port or bind to
      // 127.0.0.1 if that is not needed.
      '--remote-debugging-address=0.0.0.0',
      '--disable-gpu',
      '--disable-features=IsolateOrigins,site-per-process',
      '--blink-settings=imagesEnabled=true'
    ]
  });

  let page;
  try {
    page = await browser.newPage();

    console.log(`Testing expertflyer.com`);
    await goto_Page('https://www.expertflyer.com');
    await waitForNetworkIdle(page, 3000, 0);
    await checking_error(do_2nd_part);
  } finally {
    // Release the Chromium process no matter how the steps above ended.
    await browser.close();
  }

  /**
   * Resolves after `ms` milliseconds. Used instead of page.waitFor(), which is
   * deprecated and absent from newer Puppeteer releases.
   */
  function delay (ms) {
    return new Promise(resolve => setTimeout(resolve, ms));
  }

  /**
   * Runs an optional step. A failure is logged (not thrown) and yields undefined,
   * so a missing element does not abort the whole run.
   */
  async function best_effort (description, action) {
    try {
      return await action();
    } catch (error) {
      console.log(`catch error during ${description}: ${error}`);
      return undefined;
    }
  }

  /** Clicks a menu link, waits for the page to settle, logs its heading and saves a screenshot. */
  async function capture_menu_page (link_selector, screenshot_path) {
    await best_effort(`click on ${link_selector}`, () => page.click(link_selector));
    await delay(5000);
    const heading = await best_effort('reading the page heading', () =>
      page.$eval('#headerTitleContainer > h1', e => e.innerText));
    if (heading !== undefined) {
      console.log(heading);
    }
    await page.screenshot({ path: screenshot_path });
  }

  /** Second step: open the '#yui-gen2' menu entry, then continue with the third step. */
  async function do_2nd_part () {
    await capture_menu_page('#yui-gen2 > a', 'expertflyer1.png');
    await checking_error(do_3nd_part);
  }

  /** Third (final) step: open the '#yui-gen1' menu entry. */
  async function do_3nd_part () {
    await capture_menu_page('#yui-gen1 > a', 'expertflyer2.png');
    console.log(`All done, check the screenshots?`);
  }

  /**
   * Decides whether the current page is the normal site or a block page and
   * acts on it:
   *  - menu links present: run `callback`;
   *  - CAPTCHA page: wait until the user confirms it is solved, then run `callback`;
   *  - IP-blocked page: log and stop;
   *  - anything else: wait 10 s and check again, up to `max_rechecks` times.
   *
   * Errors are logged rather than thrown, matching the original best-effort
   * behaviour of this step.
   *
   * @param {Function} callback - async continuation to run once the page is usable.
   * @param {number} [max_rechecks=6] - how many times to re-inspect an unrecognised page.
   */
  async function checking_error (callback, max_rechecks = 6) {
    const captcha_msg = "Due to suspicious activity from your computer, we have blocked your access to ExpertFlyer. After completing the CAPTCHA below, you will immediately regain access unless further suspicious behavior is detected.";
    const ip_blocked = "Due to recent suspicious activity from your computer, we have blocked your access to ExpertFlyer. If you feel this block is in error, please contact us using the form below.";

    const proceed = async () => {
      if (typeof callback === 'function') {
        await callback();
      }
    };

    try {
      for (let recheck = 0; recheck <= max_rechecks; recheck++) {
        // A failed evaluate (e.g. the page is mid-navigation) yields undefined,
        // which is treated as "not loaded yet" instead of as success.
        const menu_links = await best_effort('counting menu links', () =>
          page.evaluate(() => document.querySelectorAll('a[class="text yuimenubaritemlabel"]').length));

        if (menu_links > 0) {
          console.log(`Page loaded successfully! You can do things here.`);
          await proceed();
          return;
        }

        console.log(`Error found`);
        const error_msg_details = await best_effort('reading the block message', () =>
          page.$eval('body > p:nth-child(2)', e => e.innerText));

        if (error_msg_details === captcha_msg) {
          console.log(`Google Captcha found, You have to solve the captch here manually or some automation recaptcha service`);
          await verify_User_answer();
          await proceed();
          return;
        }

        if (error_msg_details === ip_blocked) {
          console.log(`The current ip address is blocked. The only way is change the ip address.`);
          return;
        }

        if (recheck < max_rechecks) {
          console.log(`Waiting for error page load... Waiting for 10 sec before rechecking...`);
          await delay(10000);
        }
      }
      console.log(`Giving up: the page was still not recognised after ${max_rechecks} rechecks.`);
    } catch (error) {
      console.log(`catch error ${error}`);
    }
  }

  /**
   * Navigates to `page_URL`, retrying on failure.
   *
   * @param {string} page_URL - address to open.
   * @param {number} [max_attempts=5] - total attempts before the last error is rethrown.
   */
  async function goto_Page (page_URL, max_attempts = 5) {
    for (let attempt_no = 1; ; attempt_no++) {
      try {
        await page.goto(page_URL, { waitUntil: 'networkidle2', timeout: 30000 });
        return;
      } catch (error) {
        if (attempt_no >= max_attempts) {
          console.log(`Error in loading page, giving up after ${attempt_no} attempts: ${error}`);
          throw error;
        }
        console.log(`Error in loading page, re-trying...`);
        // Brief pause so an immediately-failing navigation does not spin.
        await delay(1000);
      }
    }
  }

  /**
   * Prompts on the terminal until the user answers "yes" (case-insensitive,
   * surrounding whitespace ignored), then runs `call_back` if one was given.
   *
   * @param {Function} [call_back] - optional async continuation.
   */
  async function verify_User_answer (call_back) {
    const normalise = answer => String(answer).trim().toLowerCase();

    let user_Answer = normalise(await readLine());
    while (user_Answer !== 'yes') {
      console.log(`answer not match. try again...`);
      // Every answer that is read is also evaluated; nothing is discarded.
      user_Answer = normalise(await readLine());
    }

    console.log(`user_Answer is ${user_Answer}, Processing...`);
    if (typeof call_back === 'function') {
      await call_back();
    }
  }

  /** Asks the user one question on stdin/stdout and resolves with the typed line. */
  async function readLine () {
    const rl = readline.createInterface({
      input: process.stdin,
      output: process.stdout
    });

    return new Promise(resolve => {
      rl.question('Solve the captcha and type yes to continue: ', (answer) => {
        rl.close();
        resolve(answer);
      });
    });
  }

  /**
   * Resolves once no more than `maxInflightRequests` requests have been in
   * flight for `timeout` milliseconds without interruption.
   *
   * @param {object} page - object exposing on()/removeListener() for the
   *   'request', 'requestfinished' and 'requestfailed' events.
   * @param {number} timeout - required quiet period in milliseconds.
   * @param {number} [maxInflightRequests=0] - in-flight requests still counted as idle.
   * @returns {Promise<void>}
   */
  async function waitForNetworkIdle (page, timeout, maxInflightRequests = 0) {
    console.log('waitForNetworkIdle called');
    page.on('request', onRequestStarted);
    page.on('requestfinished', onRequestFinished);
    page.on('requestfailed', onRequestFinished);

    let inflight = 0;
    let fulfill;
    const promise = new Promise(x => { fulfill = x; });
    let timeoutId = setTimeout(onTimeoutDone, timeout);
    return promise;

    // The quiet period elapsed: detach the listeners and settle.
    function onTimeoutDone () {
      page.removeListener('request', onRequestStarted);
      page.removeListener('requestfinished', onRequestFinished);
      page.removeListener('requestfailed', onRequestFinished);
      fulfill();
    }

    // Too many requests in flight: stop the idle countdown.
    function onRequestStarted () {
      ++inflight;
      if (inflight > maxInflightRequests) {
        clearTimeout(timeoutId);
      }
    }

    // Back down to the allowed level: restart the idle countdown.
    function onRequestFinished () {
      if (inflight === 0) {
        return;
      }
      --inflight;
      if (inflight === maxInflightRequests) {
        timeoutId = setTimeout(onTimeoutDone, timeout);
      }
    }
  }
}
// Start the bot. The returned promise is handled explicitly so a failure is
// reported and reflected in the exit code instead of being an unhandled rejection.
run().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});

请注意,"Solve the captcha and type yes to continue:"(解决验证码并输入 yes 以继续)这一提示对应的方法未按预期工作,需要进行一些修复.

编辑:10分钟后重新运行机器人,再次获得CAPTCHA.在chrome://inspect/#devices上解决了CAPTCHA重新启动了机器人,一切都再次工作.没有IP禁令.

其他推荐答案

可以帮助的东西:

  • 请求头(Headers)应与常见浏览器的类似,包括:
  • 如果要多个请求,请在它们之间放置随机超时
  • 如果您跟随在页面中找到的链接,请相应地设置 Referer 请求头
  • 图像应该启用
  • javascript 应启用
    • 检查"navigator.plugins"和"navigator.language"是否已在客户端页面的 JavaScript 上下文中设置
  • 使用代理

其他推荐答案

如果从网站的角度来看,您确实是在做可疑的事情.因此,每当想绕过这类检测时,一定要先思考他们是如何思考的.

设置cookie正确

Puppeteer 和 PhantomJS 等工具使用的是真实的浏览器,其中使用的 cookie 比通过 Postman 之类的工具时更好.您只需要正确地使用 cookie.

您可以使用page.setCookie(...cookies)设置cookie. cookie是序列化的,因此如果cookie是一个对象数组,则可以简单地执行此操作,

// Example values only — replace them with real cookies.
const cookies = [
  { name: 'test', value: 'foo' },
  { name: 'test2', value: 'foo' },
];
// setCookie takes each cookie as a separate argument, hence the spread.
await page.setCookie(...cookies);

尝试调整行为

关闭无头模式并查看网站的行为.

// Launch with headless mode switched off, as suggested above, to watch how the site behaves.
await puppeteer.launch({ headless: false });

尝试代理

某些网站监视基于IP地址,如果多次命中来自相同IP,则会阻止请求.最好在这种情况下使用旋转代理.

本文地址:https://www.itbaoku.cn/post/1739929.html