[{"data":1,"prerenderedAt":1162},["ShallowReactive",2],{"page-/post/spider/puppeteer-jujin-user-info":3,"surrounding-page":1153},{"id":4,"title":5,"author":6,"body":7,"date":1138,"description":5,"extension":1139,"group":6,"lastmod":1140,"meta":1141,"navigation":1144,"path":1145,"rawbody":1146,"seo":1147,"showTitle":5,"stem":1148,"tags":1149,"versions":6,"__hash__":1152},"content/post/spider/puppeteer-jujin-user-info.md","使用puppeteer爬取掘金个人信息",null,{"type":8,"value":9,"toc":1129},"minimark",[10,14,18,21,24,27,39,42,45,48,51,114,117,244,247,295,316,399,402,408,411,414,422,425,428,431,434,437,1067,1070,1073,1076,1080,1083,1086,1089,1092,1095,1098,1101,1104,1107,1110,1113,1116,1119,1122,1125],[11,12,13],"h2",{"id":13},"引言",[15,16,17],"p",{},"上一篇文章《使用puppeteer爬取掘金热榜》里，用了puppeteer的一些基础语法就完成了数据的爬取，这种可见即可爬的方式对于普通的使用者在感觉上来说，还是非常可靠和实用的。这次依旧是选择爬掘金的个人信息，但绕开了最麻烦的一步。",[11,19,20],{"id":20},"绕过登录",[15,22,23],{},"登录里比较麻烦就是验证码，有滑块、数字、数学计算等等多种多样的。",[15,25,26],{},"由于爬取的目的一般来说只是不太方便打开目标网站，或者要关注的网站太多，需要聚合一下每天刷一刷，所以对爬取的速度并没有太多要求。比较重要的是拿到数据后，如何进行可视化，所以就又回到了前端界面优化的问题上了。",[15,28,29,30,34,35,38],{},"所以我这里选择手动登录，并且之前使用的",[31,32,33],"code",{},"puppeteer","这个库，现在换成了",[31,36,37],{},"puppeteer-core","，我用它来控制现有的chrome浏览器。",[15,40,41],{},"还有一个隐形的好处。比如有一个好朋友也有类似的需求，而他不懂技术，把程序打包成二进制给他之后，他能看着浏览器一步步的操作，反而会感觉更安心一些🤔",[15,43,44],{},"开始改造一下上次写的代码。",[11,46,47],{"id":47},"连接现有浏览器",[15,49,50],{},"因为不用默认内置的浏览器了，所以需要先打开自己的chrome浏览器，然后获取到浏览器的调试信息，再进行连接",[52,53,58],"pre",{"className":54,"code":55,"language":56,"meta":57,"style":57},"language-typescript shiki shiki-themes github-light","const browser = await puppeteer.connect({\n      slowMo: 50,\n      browserWSEndpoint: address,\n    });\n","typescript","",[31,59,60,90,102,108],{"__ignoreMap":57},[61,62,65,69,73,76,79,83,87],"span",{"class":63,"line":64},"line",1,[61,66,68],{"class":67},"sD7c4","const",[61,70,72],{"class":71},"sYu0t"," browser",[61,74,75],{"class":67}," =",[61,77,78],{"class":67}," await",[61,80,82],{"class":81},"sgsFI"," 
puppeteer.",[61,84,86],{"class":85},"s7eDp","connect",[61,88,89],{"class":81},"({\n",[61,91,93,96,99],{"class":63,"line":92},2,[61,94,95],{"class":81},"      slowMo: ",[61,97,98],{"class":71},"50",[61,100,101],{"class":81},",\n",[61,103,105],{"class":63,"line":104},3,[61,106,107],{"class":81},"      browserWSEndpoint: address,\n",[61,109,111],{"class":63,"line":110},4,[61,112,113],{"class":81},"    });\n",[15,115,116],{},"获取address前，需要先用debug模式启动chrome，以macos为例，启动一个9222的端口号",[52,118,120],{"className":54,"code":119,"language":56,"meta":57,"style":57},"/Applications/Google\\ Chrome.app/Contents/MacOS/Google\\ Chrome  --remote-debugging-port=9222 --no-first-run --no-default-browser-check --user-data-dir=$(mktemp -d -t 'your_chrome_data_dir') \n",[31,121,122],{"__ignoreMap":57},[61,123,124,127,130,132,135,137,140,142,145,147,150,153,156,159,162,164,167,170,173,176,179,181,184,186,189,191,193,196,199,201,204,206,209,211,214,216,219,221,224,227,229,232,234,237,241],{"class":63,"line":64},[61,125,126],{"class":67},"/",[61,128,129],{"class":81},"Applications",[61,131,126],{"class":67},[61,133,134],{"class":81},"Google\\ Chrome.app",[61,136,126],{"class":67},[61,138,139],{"class":81},"Contents",[61,141,126],{"class":67},[61,143,144],{"class":81},"MacOS",[61,146,126],{"class":67},[61,148,149],{"class":81},"Google\\ Chrome  ",[61,151,152],{"class":67},"--",[61,154,155],{"class":81},"remote",[61,157,158],{"class":67},"-",[61,160,161],{"class":81},"debugging",[61,163,158],{"class":67},[61,165,166],{"class":81},"port",[61,168,169],{"class":67},"=",[61,171,172],{"class":71},"9222",[61,174,175],{"class":67}," --",[61,177,178],{"class":81},"no",[61,180,158],{"class":67},[61,182,183],{"class":81},"first",[61,185,158],{"class":67},[61,187,188],{"class":81},"run ",[61,190,152],{"class":67},[61,192,178],{"class":81},[61,194,195],{"class":67},"-default-",[61,197,198],{"class":81},"browser",[61,200,158],{"class":67},[61,202,203],{"class":81},"check 
",[61,205,152],{"class":67},[61,207,208],{"class":81},"user",[61,210,158],{"class":67},[61,212,213],{"class":81},"data",[61,215,158],{"class":67},[61,217,218],{"class":81},"dir",[61,220,169],{"class":67},[61,222,223],{"class":85},"$",[61,225,226],{"class":81},"(mktemp ",[61,228,158],{"class":67},[61,230,231],{"class":81},"d ",[61,233,158],{"class":67},[61,235,236],{"class":81},"t ",[61,238,240],{"class":239},"sYBdl","'your_chrome_data_dir'",[61,242,243],{"class":81},")\n",[15,245,246],{},"如果是windows，dir路径需要是一个存在的路径",[52,248,250],{"className":54,"code":249,"language":56,"meta":57,"style":57},"文件路径/chrome.exe --remote-debugging-port=9222 --user-data-dir=\"your_chrome_data_dir\"\n\n\n",[31,251,252],{"__ignoreMap":57},[61,253,254,257,259,262,264,266,268,270,272,274,276,278,280,282,284,286,288,290,292],{"class":63,"line":64},[61,255,256],{"class":81},"文件路径",[61,258,126],{"class":67},[61,260,261],{"class":81},"chrome.exe ",[61,263,152],{"class":67},[61,265,155],{"class":81},[61,267,158],{"class":67},[61,269,161],{"class":81},[61,271,158],{"class":67},[61,273,166],{"class":81},[61,275,169],{"class":67},[61,277,172],{"class":71},[61,279,175],{"class":67},[61,281,208],{"class":81},[61,283,158],{"class":67},[61,285,213],{"class":81},[61,287,158],{"class":67},[61,289,218],{"class":81},[61,291,169],{"class":67},[61,293,294],{"class":239},"\"your_chrome_data_dir\"\n",[15,296,297,298,301,302,308,309,312,313],{},"启动成功后可以通过",[31,299,300],{},"GET","  ",[303,304,305],"a",{"href":305,"rel":306,"title":305},"http://127.0.0.1:9222/json/version",[307],"nofollow"," 这个地址获取到",[31,310,311],{},"webSocketDebuggerUrl","，也就是上边的",[31,314,315],{},"address",[52,317,319],{"className":54,"code":318,"language":56,"meta":57,"style":57},"const data = await axios\n      .get('http://127.0.0.1:9222/json/version')\n      .catch((err) => {\n        this.logger.error(`未找到已启动的chrome浏览器1`);\n      
});\n",[31,320,321,335,351,374,393],{"__ignoreMap":57},[61,322,323,325,328,330,332],{"class":63,"line":64},[61,324,68],{"class":67},[61,326,327],{"class":71}," data",[61,329,75],{"class":67},[61,331,78],{"class":67},[61,333,334],{"class":81}," axios\n",[61,336,337,340,343,346,349],{"class":63,"line":92},[61,338,339],{"class":81},"      .",[61,341,342],{"class":85},"get",[61,344,345],{"class":81},"(",[61,347,348],{"class":239},"'http://127.0.0.1:9222/json/version'",[61,350,243],{"class":81},[61,352,353,355,358,361,365,368,371],{"class":63,"line":104},[61,354,339],{"class":81},[61,356,357],{"class":85},"catch",[61,359,360],{"class":81},"((",[61,362,364],{"class":363},"sqxcx","err",[61,366,367],{"class":81},") ",[61,369,370],{"class":67},"=>",[61,372,373],{"class":81}," {\n",[61,375,376,379,382,385,387,390],{"class":63,"line":110},[61,377,378],{"class":71},"        this",[61,380,381],{"class":81},".logger.",[61,383,384],{"class":85},"error",[61,386,345],{"class":81},[61,388,389],{"class":239},"`未找到已启动的chrome浏览器1`",[61,391,392],{"class":81},");\n",[61,394,396],{"class":63,"line":395},5,[61,397,398],{"class":81},"      });\n",[15,400,401],{},"先获取webSocketDebuggerUrl， 如果获取成功，使用puppeteer-core连接，如果获取失败，提示出来，不再执行。",[15,403,404,407],{},[31,405,406],{},"slowMo","可以控制脚本操作chrome的时候慢一些，方便观察。",[11,409,410],{"id":410},"自行登录",[15,412,413],{},"连接成功后，自己可以使用任意方式登录。",[15,415,416,417,421],{},"但是有一个很大的问题，启动后的",[418,419,420],"strong",{},"命令行界面和浏览器，都不能关闭","。不然登录状态就没了，再重新打开后又得登录一遍。如果爬取的大部分网站不需要登录还好，如果都要登录的话，那还是得想办法自动登录一下。",[15,423,424],{},"打码平台有很多，云码、超级鹰等都可以自己对接，不过大部分都是要钱的。也可以自己找找github上有没有开源项目，接入一下。或者等我找到之后，再来看我的😎",[15,426,427],{},"最小化窗口没有影响，跑起来还是会自己打开。",[11,429,430],{"id":430},"开始爬取",[15,432,433],{},"以下内容，和爬取热榜大同小异，有需要的自取。 ",[15,435,436],{},"获取css selector的方式也是用的上篇文章的方式。",[52,438,440],{"className":54,"code":439,"language":56,"meta":57,"style":57},"// 等待数据展示区域展示出来\n    await page.waitForSelector(\n      '#juejin > div.view-container > main > div > div.minor-area > div > 
div.stat-block.block.shadow > div.block-body ',\n    );\n    // 等待头像加载出来, 头像出来了, 右侧信息肯定都有了\n    await page.waitForSelector(\n      '#juejin > div.view-container > main > div > div.major-area > div.user-info-block.block.shadow > div.avatar.jj-avatar > img',\n    );\n    // 文章被点赞数\n    const articleUpvote = await page.$eval(\n      '#juejin > div.view-container > main > div > div.minor-area > div > div.stat-block.block.shadow > div.block-body > div:nth-child(1) > span > span',\n      (el) => el.textContent,\n    );\n    console.log(`文章被点赞数`, articleUpvote);\n    // 文章被阅读数\n    const articleViewNumber = await page.$eval(\n      '#juejin > div.view-container > main > div > div.minor-area > div > div.stat-block.block.shadow > div.block-body > div:nth-child(2) > span > span',\n      (el) => el.textContent,\n    );\n    console.log(`文章被阅读数`, articleViewNumber);\n    // 文章被阅读数\n    const articleJueNumber = await page.$eval(\n      '#juejin > div.view-container > main > div > div.minor-area > div > div.stat-block.block.shadow > div.block-body > a > span > span',\n      (el) => el.textContent,\n    );\n    console.log(`掘力值`, articleJueNumber);\n    // 获取头像\n    const avatarUrl = await page.$eval(\n      '#juejin > div.view-container > main > div > div.major-area > div.user-info-block.block.shadow > div.avatar.jj-avatar > img',\n      (el) => el.getAttribute('src'),\n    );\n    console.log(`头像地址`, avatarUrl);\n    // 获取用户名\n    const userName = await page.$eval(\n      '#juejin > div.view-container > main > div > div.major-area > div.user-info-block.block.shadow > div.info-box.info-box > div.top > div.left > h1 > span',\n      (el) => el.textContent,\n    );\n    console.log(`用户名`, userName);\n    const position = await page.$eval(\n      '#juejin > div.view-container > main > div > div.major-area > div.user-info-block.block.shadow > div.info-box.info-box > div.introduction > div.left > div.position > span > span:nth-child(1)',\n      (el) => el.textContent,\n    );\n    
console.log(`职位`, position);\n    const company = await page.$eval(\n      '#juejin > div.view-container > main > div > div.major-area > div.user-info-block.block.shadow > div.info-box.info-box > div.introduction > div.left > div.position > span > span:nth-child(3)',\n      (el) => el.textContent,\n    );\n    console.log(`公司`, company);\n    const intro = await page.$eval(\n      '#juejin > div.view-container > main > div > div.major-area > div.user-info-block.block.shadow > div.info-box.info-box > div.introduction > div.left > div.intro > span',\n      (el) => el.textContent,\n    );\n    console.log(`个人简介`, intro);\n    let fansNumber = await page.$eval(\n      '#juejin > div.view-container > main > div > div.minor-area > div > div.follow-block.block.shadow > a:nth-child(2) > div.item-count',\n      (el) => el.textContent,\n    );\n\n",[31,441,442,448,462,469,474,479,490,498,503,509,529,537,553,558,575,581,599,607,620,625,640,645,663,671,684,689,704,710,728,735,760,765,780,786,804,812,825,830,845,863,871,884,889,904,922,930,943,948,963,981,989,1002,1007,1022,1041,1049,1062],{"__ignoreMap":57},[61,443,444],{"class":63,"line":64},[61,445,447],{"class":446},"sAwPA","// 等待数据展示区域展示出来\n",[61,449,450,453,456,459],{"class":63,"line":92},[61,451,452],{"class":67},"    await",[61,454,455],{"class":81}," page.",[61,457,458],{"class":85},"waitForSelector",[61,460,461],{"class":81},"(\n",[61,463,464,467],{"class":63,"line":104},[61,465,466],{"class":239},"      '#juejin > div.view-container > main > div > div.minor-area > div > div.stat-block.block.shadow > div.block-body '",[61,468,101],{"class":81},[61,470,471],{"class":63,"line":110},[61,472,473],{"class":81},"    );\n",[61,475,476],{"class":63,"line":395},[61,477,478],{"class":446},"    // 等待头像加载出来, 头像出来了, 
右侧信息肯定都有了\n",[61,480,482,484,486,488],{"class":63,"line":481},6,[61,483,452],{"class":67},[61,485,455],{"class":81},[61,487,458],{"class":85},[61,489,461],{"class":81},[61,491,493,496],{"class":63,"line":492},7,[61,494,495],{"class":239},"      '#juejin > div.view-container > main > div > div.major-area > div.user-info-block.block.shadow > div.avatar.jj-avatar > img'",[61,497,101],{"class":81},[61,499,501],{"class":63,"line":500},8,[61,502,473],{"class":81},[61,504,506],{"class":63,"line":505},9,[61,507,508],{"class":446},"    // 文章被点赞数\n",[61,510,512,515,518,520,522,524,527],{"class":63,"line":511},10,[61,513,514],{"class":67},"    const",[61,516,517],{"class":71}," articleUpvote",[61,519,75],{"class":67},[61,521,78],{"class":67},[61,523,455],{"class":81},[61,525,526],{"class":85},"$eval",[61,528,461],{"class":81},[61,530,532,535],{"class":63,"line":531},11,[61,533,534],{"class":239},"      '#juejin > div.view-container > main > div > div.minor-area > div > div.stat-block.block.shadow > div.block-body > div:nth-child(1) > span > span'",[61,536,101],{"class":81},[61,538,540,543,546,548,550],{"class":63,"line":539},12,[61,541,542],{"class":81},"      (",[61,544,545],{"class":363},"el",[61,547,367],{"class":81},[61,549,370],{"class":67},[61,551,552],{"class":81}," el.textContent,\n",[61,554,556],{"class":63,"line":555},13,[61,557,473],{"class":81},[61,559,561,564,567,569,572],{"class":63,"line":560},14,[61,562,563],{"class":81},"    console.",[61,565,566],{"class":85},"log",[61,568,345],{"class":81},[61,570,571],{"class":239},"`文章被点赞数`",[61,573,574],{"class":81},", articleUpvote);\n",[61,576,578],{"class":63,"line":577},15,[61,579,580],{"class":446},"    // 文章被阅读数\n",[61,582,584,586,589,591,593,595,597],{"class":63,"line":583},16,[61,585,514],{"class":67},[61,587,588],{"class":71}," 
articleViewNumber",[61,590,75],{"class":67},[61,592,78],{"class":67},[61,594,455],{"class":81},[61,596,526],{"class":85},[61,598,461],{"class":81},[61,600,602,605],{"class":63,"line":601},17,[61,603,604],{"class":239},"      '#juejin > div.view-container > main > div > div.minor-area > div > div.stat-block.block.shadow > div.block-body > div:nth-child(2) > span > span'",[61,606,101],{"class":81},[61,608,610,612,614,616,618],{"class":63,"line":609},18,[61,611,542],{"class":81},[61,613,545],{"class":363},[61,615,367],{"class":81},[61,617,370],{"class":67},[61,619,552],{"class":81},[61,621,623],{"class":63,"line":622},19,[61,624,473],{"class":81},[61,626,628,630,632,634,637],{"class":63,"line":627},20,[61,629,563],{"class":81},[61,631,566],{"class":85},[61,633,345],{"class":81},[61,635,636],{"class":239},"`文章被阅读数`",[61,638,639],{"class":81},", articleViewNumber);\n",[61,641,643],{"class":63,"line":642},21,[61,644,580],{"class":446},[61,646,648,650,653,655,657,659,661],{"class":63,"line":647},22,[61,649,514],{"class":67},[61,651,652],{"class":71}," articleJueNumber",[61,654,75],{"class":67},[61,656,78],{"class":67},[61,658,455],{"class":81},[61,660,526],{"class":85},[61,662,461],{"class":81},[61,664,666,669],{"class":63,"line":665},23,[61,667,668],{"class":239},"      '#juejin > div.view-container > main > div > div.minor-area > div > div.stat-block.block.shadow > div.block-body > a > span > span'",[61,670,101],{"class":81},[61,672,674,676,678,680,682],{"class":63,"line":673},24,[61,675,542],{"class":81},[61,677,545],{"class":363},[61,679,367],{"class":81},[61,681,370],{"class":67},[61,683,552],{"class":81},[61,685,687],{"class":63,"line":686},25,[61,688,473],{"class":81},[61,690,692,694,696,698,701],{"class":63,"line":691},26,[61,693,563],{"class":81},[61,695,566],{"class":85},[61,697,345],{"class":81},[61,699,700],{"class":239},"`掘力值`",[61,702,703],{"class":81},", articleJueNumber);\n",[61,705,707],{"class":63,"line":706},27,[61,708,709],{"class":446},"    // 
获取头像\n",[61,711,713,715,718,720,722,724,726],{"class":63,"line":712},28,[61,714,514],{"class":67},[61,716,717],{"class":71}," avatarUrl",[61,719,75],{"class":67},[61,721,78],{"class":67},[61,723,455],{"class":81},[61,725,526],{"class":85},[61,727,461],{"class":81},[61,729,731,733],{"class":63,"line":730},29,[61,732,495],{"class":239},[61,734,101],{"class":81},[61,736,738,740,742,744,746,749,752,754,757],{"class":63,"line":737},30,[61,739,542],{"class":81},[61,741,545],{"class":363},[61,743,367],{"class":81},[61,745,370],{"class":67},[61,747,748],{"class":81}," el.",[61,750,751],{"class":85},"getAttribute",[61,753,345],{"class":81},[61,755,756],{"class":239},"'src'",[61,758,759],{"class":81},"),\n",[61,761,763],{"class":63,"line":762},31,[61,764,473],{"class":81},[61,766,768,770,772,774,777],{"class":63,"line":767},32,[61,769,563],{"class":81},[61,771,566],{"class":85},[61,773,345],{"class":81},[61,775,776],{"class":239},"`头像地址`",[61,778,779],{"class":81},", avatarUrl);\n",[61,781,783],{"class":63,"line":782},33,[61,784,785],{"class":446},"    // 获取用户名\n",[61,787,789,791,794,796,798,800,802],{"class":63,"line":788},34,[61,790,514],{"class":67},[61,792,793],{"class":71}," userName",[61,795,75],{"class":67},[61,797,78],{"class":67},[61,799,455],{"class":81},[61,801,526],{"class":85},[61,803,461],{"class":81},[61,805,807,810],{"class":63,"line":806},35,[61,808,809],{"class":239},"      '#juejin > div.view-container > main > div > div.major-area > div.user-info-block.block.shadow > div.info-box.info-box > div.top > div.left > h1 > 
span'",[61,811,101],{"class":81},[61,813,815,817,819,821,823],{"class":63,"line":814},36,[61,816,542],{"class":81},[61,818,545],{"class":363},[61,820,367],{"class":81},[61,822,370],{"class":67},[61,824,552],{"class":81},[61,826,828],{"class":63,"line":827},37,[61,829,473],{"class":81},[61,831,833,835,837,839,842],{"class":63,"line":832},38,[61,834,563],{"class":81},[61,836,566],{"class":85},[61,838,345],{"class":81},[61,840,841],{"class":239},"`用户名`",[61,843,844],{"class":81},", userName);\n",[61,846,848,850,853,855,857,859,861],{"class":63,"line":847},39,[61,849,514],{"class":67},[61,851,852],{"class":71}," position",[61,854,75],{"class":67},[61,856,78],{"class":67},[61,858,455],{"class":81},[61,860,526],{"class":85},[61,862,461],{"class":81},[61,864,866,869],{"class":63,"line":865},40,[61,867,868],{"class":239},"      '#juejin > div.view-container > main > div > div.major-area > div.user-info-block.block.shadow > div.info-box.info-box > div.introduction > div.left > div.position > span > span:nth-child(1)'",[61,870,101],{"class":81},[61,872,874,876,878,880,882],{"class":63,"line":873},41,[61,875,542],{"class":81},[61,877,545],{"class":363},[61,879,367],{"class":81},[61,881,370],{"class":67},[61,883,552],{"class":81},[61,885,887],{"class":63,"line":886},42,[61,888,473],{"class":81},[61,890,892,894,896,898,901],{"class":63,"line":891},43,[61,893,563],{"class":81},[61,895,566],{"class":85},[61,897,345],{"class":81},[61,899,900],{"class":239},"`职位`",[61,902,903],{"class":81},", position);\n",[61,905,907,909,912,914,916,918,920],{"class":63,"line":906},44,[61,908,514],{"class":67},[61,910,911],{"class":71}," company",[61,913,75],{"class":67},[61,915,78],{"class":67},[61,917,455],{"class":81},[61,919,526],{"class":85},[61,921,461],{"class":81},[61,923,925,928],{"class":63,"line":924},45,[61,926,927],{"class":239},"      '#juejin > div.view-container > main > div > div.major-area > div.user-info-block.block.shadow > div.info-box.info-box > div.introduction > div.left > 
div.position > span > span:nth-child(3)'",[61,929,101],{"class":81},[61,931,933,935,937,939,941],{"class":63,"line":932},46,[61,934,542],{"class":81},[61,936,545],{"class":363},[61,938,367],{"class":81},[61,940,370],{"class":67},[61,942,552],{"class":81},[61,944,946],{"class":63,"line":945},47,[61,947,473],{"class":81},[61,949,951,953,955,957,960],{"class":63,"line":950},48,[61,952,563],{"class":81},[61,954,566],{"class":85},[61,956,345],{"class":81},[61,958,959],{"class":239},"`公司`",[61,961,962],{"class":81},", company);\n",[61,964,966,968,971,973,975,977,979],{"class":63,"line":965},49,[61,967,514],{"class":67},[61,969,970],{"class":71}," intro",[61,972,75],{"class":67},[61,974,78],{"class":67},[61,976,455],{"class":81},[61,978,526],{"class":85},[61,980,461],{"class":81},[61,982,984,987],{"class":63,"line":983},50,[61,985,986],{"class":239},"      '#juejin > div.view-container > main > div > div.major-area > div.user-info-block.block.shadow > div.info-box.info-box > div.introduction > div.left > div.intro > span'",[61,988,101],{"class":81},[61,990,992,994,996,998,1000],{"class":63,"line":991},51,[61,993,542],{"class":81},[61,995,545],{"class":363},[61,997,367],{"class":81},[61,999,370],{"class":67},[61,1001,552],{"class":81},[61,1003,1005],{"class":63,"line":1004},52,[61,1006,473],{"class":81},[61,1008,1010,1012,1014,1016,1019],{"class":63,"line":1009},53,[61,1011,563],{"class":81},[61,1013,566],{"class":85},[61,1015,345],{"class":81},[61,1017,1018],{"class":239},"`个人简介`",[61,1020,1021],{"class":81},", intro);\n",[61,1023,1025,1028,1031,1033,1035,1037,1039],{"class":63,"line":1024},54,[61,1026,1027],{"class":67},"    let",[61,1029,1030],{"class":81}," fansNumber ",[61,1032,169],{"class":67},[61,1034,78],{"class":67},[61,1036,455],{"class":81},[61,1038,526],{"class":85},[61,1040,461],{"class":81},[61,1042,1044,1047],{"class":63,"line":1043},55,[61,1045,1046],{"class":239},"      '#juejin > div.view-container > main > div > div.minor-area > div > 
div.follow-block.block.shadow > a:nth-child(2) > div.item-count'",[61,1048,101],{"class":81},[61,1050,1052,1054,1056,1058,1060],{"class":63,"line":1051},56,[61,1053,542],{"class":81},[61,1055,545],{"class":363},[61,1057,367],{"class":81},[61,1059,370],{"class":67},[61,1061,552],{"class":81},[61,1063,1065],{"class":63,"line":1064},57,[61,1066,473],{"class":81},[15,1068,1069],{},"信息拿到之后，重点是要干啥用。",[11,1071,1072],{"id":1072},"信息处理",[15,1074,1075],{},"我提供几种思路，大家酌情考虑。",[1077,1078,1079],"h4",{"id":1079},"webhook通知",[15,1081,1082],{},"用nestjs里的定时任务，每天自动跑一下，拿到数据之后，用钉钉/飞书的webhook直接发出来，早上看看有无数据的变动或者消息。",[1077,1084,1085],{"id":1085},"数据趋势可视化",[15,1087,1088],{},"可以把每天的文章数据、粉丝数据、阅读量等记录一下，生成趋势图，看起来更直观一些。",[15,1090,1091],{},"也可以看到自己关注的博主最新更新的信息，计算一下他有几天没有更新了🤔",[15,1093,1094],{},"思路再打开一些，不一定是技术站点，其他任何自己感兴趣的网站都可以爬下来，做一些直观又好看的趋势图。",[1077,1096,1097],{"id":1097},"名片类可视化",[15,1099,1100],{},"可以把自己的数据，和平时自己看的动漫、电影、游戏等面板结合起来。",[15,1102,1103],{},"比如我能想到的就是JOJO的面板，搜了一圈，目前还没啥可用的网站。",[15,1105,1106],{},"由于游戏已经很久没玩了，从最开始的英雄联盟、守望先锋、PUBG后来就几乎不玩了，灵感已经枯竭。这两天帕兽大火，我才开了一下机，发现steam密码早忘了，然后账号密码找回，让我贴一张支付截图，我支付宝账单都找到17年去了🥲",[11,1108,1109],{"id":1109},"小结",[15,1111,1112],{},"![[1-img-20241119141155.png]]",[15,1114,1115],{},"认识这个框的，估计也快30了吧😏",[15,1117,1118],{},"以上就是全部内容啦，没啥新鲜东西，有时间再慢慢做吧。",[15,1120,1121],{},"最近摸鱼环境不容乐观，闲里偷忙的机会越来越少啦！！",[15,1123,1124],{},"我是枣把儿，欢迎关注我的公众号：早早集市，来找我玩耍🥳",[1126,1127,1128],"style",{},"html pre.shiki code .sD7c4, html code.shiki .sD7c4{--shiki-default:#D73A49}html pre.shiki code .sYu0t, html code.shiki .sYu0t{--shiki-default:#005CC5}html pre.shiki code .sgsFI, html code.shiki .sgsFI{--shiki-default:#24292E}html pre.shiki code .s7eDp, html code.shiki .s7eDp{--shiki-default:#6F42C1}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: 
var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .sYBdl, html code.shiki .sYBdl{--shiki-default:#032F62}html pre.shiki code .sqxcx, html code.shiki .sqxcx{--shiki-default:#E36209}html pre.shiki code .sAwPA, html code.shiki .sAwPA{--shiki-default:#6A737D}",{"title":57,"searchDepth":92,"depth":92,"links":1130},[1131,1132,1133,1134,1135,1136,1137],{"id":13,"depth":92,"text":13},{"id":20,"depth":92,"text":20},{"id":47,"depth":92,"text":47},{"id":410,"depth":92,"text":410},{"id":430,"depth":92,"text":430},{"id":1072,"depth":92,"text":1072},{"id":1109,"depth":92,"text":1109},"2024-01-29T00:00:00.000Z","md","2025-08-19T00:00:00.000Z",{"published":1142,"category":1143},"2023-12-11 00:00:00","技术",true,"/post/spider/puppeteer-jujin-user-info","---\npublished: 2023-12-11 00:00:00\ntitle: 使用puppeteer爬取掘金个人信息\ndescription: 使用puppeteer爬取掘金个人信息\ntags: [\"技术\", \"Puppeteer\", \"Node\"]\ncategory: 技术\ndate: 2024-01-29\nlastmod: 2025-08-19\nshowTitle: 使用puppeteer爬取掘金个人信息\n---\n## 引言\n\n上一篇文章《使用puppeteer爬取掘金热榜》里，用了puppeteer的一些基础语法就完成了数据的爬取，这种可见即可爬的方式对于普通的使用者在感觉上来说，还是非常可靠和实用的。这次依旧是选择爬掘金的个人信息，但绕开了最麻烦的一步。\n\n## 绕过登录\n\n登录里比较麻烦就是验证码，有滑块、数字、数学计算等等多种多样的。\n\n由于爬取的目的一般来说只是不太方便打开目标网站，或者要关注的网站太多，需要聚合一下每天刷一刷，所以对爬取的速度并没有太多要求。比较重要的是拿到数据后，如何进行可视化，所以就又回到了前端界面优化的问题上了。\n\n所以我这里选择手动登录，并且之前使用的`puppeteer`这个库，现在换成了`puppeteer-core`，我用它来控制现有的chrome浏览器。\n\n还有一个隐形的好处。比如有一个好朋友也有类似的需求，而他不懂技术，把程序打包成二进制给他之后，他能看着浏览器一步步的操作，反而会感觉更安心一些🤔\n\n开始改造一下上次写的代码。\n\n## 连接现有浏览器\n\n因为不用默认内置的浏览器了，所以需要先打开自己的chrome浏览器，然后获取到浏览器的调试信息，再进行连接\n\n```typescript\nconst browser = await puppeteer.connect({\n      slowMo: 50,\n      browserWSEndpoint: address,\n    });\n```\n\n获取address前，需要先用debug模式启动chrome，以macos为例，启动一个9222的端口号\n\n```typescript\n/Applications/Google\\ Chrome.app/Contents/MacOS/Google\\ Chrome  --remote-debugging-port=9222 --no-first-run --no-default-browser-check 
--user-data-dir=$(mktemp -d -t 'your_chrome_data_dir') \n```\n\n如果是windows，dir路径需要是一个存在的路径\n\n```typescript\n文件路径/chrome.exe --remote-debugging-port=9222 --user-data-dir=\"your_chrome_data_dir\"\n\n\n```\n\n启动成功后可以通过`GET`  [http://127.0.0.1:9222/json/version](http://127.0.0.1:9222/json/version \"http://127.0.0.1:9222/json/version\") 这个地址获取到`webSocketDebuggerUrl`，也就是上边的`address`\n\n```typescript\nconst data = await axios\n      .get('http://127.0.0.1:9222/json/version')\n      .catch((err) => {\n        this.logger.error(`未找到已启动的chrome浏览器1`);\n      });\n```\n\n先获取webSocketDebuggerUrl， 如果获取成功，使用puppeteer-core连接，如果获取失败，提示出来，不再执行。\n\n`slowMo`可以控制脚本操作chrome的时候慢一些，方便观察。\n\n## 自行登录\n\n连接成功后，自己可以使用任意方式登录。\n\n但是有一个很大的问题，启动后的**命令行界面和浏览器，都不能关闭**。不然登录状态就没了，再重新打开后又得登录一遍。如果爬取的大部分网站不需要登录还好，如果都要登录的话，那还是得想办法自动登录一下。\n\n打码平台有很多，云码、超级鹰等都可以自己对接，不过大部分都是要钱的。也可以自己找找github上有没有开源项目，接入一下。或者等我找到之后，再来看我的😎\n\n最小化窗口没有影响，跑起来还是会自己打开。\n\n## 开始爬取\n\n以下内容，和爬取热榜大同小异，有需要的自取。&#x20;\n\n获取css selector的方式也是用的上篇文章的方式。\n\n```typescript\n// 等待数据展示区域展示出来\n    await page.waitForSelector(\n      '#juejin > div.view-container > main > div > div.minor-area > div > div.stat-block.block.shadow > div.block-body ',\n    );\n    // 等待头像加载出来, 头像出来了, 右侧信息肯定都有了\n    await page.waitForSelector(\n      '#juejin > div.view-container > main > div > div.major-area > div.user-info-block.block.shadow > div.avatar.jj-avatar > img',\n    );\n    // 文章被点赞数\n    const articleUpvote = await page.$eval(\n      '#juejin > div.view-container > main > div > div.minor-area > div > div.stat-block.block.shadow > div.block-body > div:nth-child(1) > span > span',\n      (el) => el.textContent,\n    );\n    console.log(`文章被点赞数`, articleUpvote);\n    // 文章被阅读数\n    const articleViewNumber = await page.$eval(\n      '#juejin > div.view-container > main > div > div.minor-area > div > div.stat-block.block.shadow > div.block-body > div:nth-child(2) > span > span',\n      (el) => el.textContent,\n    );\n    console.log(`文章被阅读数`, 
articleViewNumber);\n    // 文章被阅读数\n    const articleJueNumber = await page.$eval(\n      '#juejin > div.view-container > main > div > div.minor-area > div > div.stat-block.block.shadow > div.block-body > a > span > span',\n      (el) => el.textContent,\n    );\n    console.log(`掘力值`, articleJueNumber);\n    // 获取头像\n    const avatarUrl = await page.$eval(\n      '#juejin > div.view-container > main > div > div.major-area > div.user-info-block.block.shadow > div.avatar.jj-avatar > img',\n      (el) => el.getAttribute('src'),\n    );\n    console.log(`头像地址`, avatarUrl);\n    // 获取用户名\n    const userName = await page.$eval(\n      '#juejin > div.view-container > main > div > div.major-area > div.user-info-block.block.shadow > div.info-box.info-box > div.top > div.left > h1 > span',\n      (el) => el.textContent,\n    );\n    console.log(`用户名`, userName);\n    const position = await page.$eval(\n      '#juejin > div.view-container > main > div > div.major-area > div.user-info-block.block.shadow > div.info-box.info-box > div.introduction > div.left > div.position > span > span:nth-child(1)',\n      (el) => el.textContent,\n    );\n    console.log(`职位`, position);\n    const company = await page.$eval(\n      '#juejin > div.view-container > main > div > div.major-area > div.user-info-block.block.shadow > div.info-box.info-box > div.introduction > div.left > div.position > span > span:nth-child(3)',\n      (el) => el.textContent,\n    );\n    console.log(`公司`, company);\n    const intro = await page.$eval(\n      '#juejin > div.view-container > main > div > div.major-area > div.user-info-block.block.shadow > div.info-box.info-box > div.introduction > div.left > div.intro > span',\n      (el) => el.textContent,\n    );\n    console.log(`个人简介`, intro);\n    let fansNumber = await page.$eval(\n      '#juejin > div.view-container > main > div > div.minor-area > div > div.follow-block.block.shadow > a:nth-child(2) > div.item-count',\n      (el) => el.textContent,\n    
);\n\n```\n\n信息拿到之后，重点是要干啥用。\n\n## 信息处理\n\n我提供几种思路，大家酌情考虑。\n\n#### webhook通知\n\n用nestjs里的定时任务，每天自动跑一下，拿到数据之后，用钉钉/飞书的webhook直接发出来，早上看看有无数据的变动或者消息。\n\n#### 数据趋势可视化\n\n可以把每天的文章数据、粉丝数据、阅读量等记录一下，生成趋势图，看起来更直观一些。\n\n也可以看到自己关注的博主最新更新的信息，计算一下他有几天没有更新了🤔\n\n思路再打开一些，不一定是技术站点，其他任何自己感兴趣的网站都可以爬下来，做一些直观又好看的趋势图。\n\n#### 名片类可视化\n\n可以把自己的数据，和平时自己看的动漫、电影、游戏等面板结合起来。\n\n比如我能想到的就是JOJO的面板，搜了一圈，目前还没啥可用的网站。\n\n由于游戏已经很久没玩了，从最开始的英雄联盟、守望先锋、PUBG后来就几乎不玩了，灵感已经枯竭。这两天帕兽大火，我才开了一下机，发现steam密码早忘了，然后账号密码找回，让我贴一张支付截图，我支付宝账单都找到17年去了🥲\n\n## 小结\n\n![[1-img-20241119141155.png]]\n\n认识这个框的，估计也快30了吧😏\n\n以上就是全部内容啦，没啥新鲜东西，有时间再慢慢做吧。\n\n最近摸鱼环境不容乐观，闲里偷忙的机会越来越少啦！！\n\n我是枣把儿，欢迎关注我的公众号：早早集市，来找我玩耍🥳\n",{"title":5,"description":5},"post/spider/puppeteer-jujin-user-info",[1143,1150,1151],"Puppeteer","Node","kKv0w6RZ5rZoA4VPAj_FyAR6FSVe29JJILadHTc1a88",[1154,1158],{"title":1155,"path":1156,"stem":1157},"OpenClaw 安装入门（Windows）","/post/zzao/openclaw/openclaw-install-windows","post/zzao/openclaw/openclaw-install-windows",{"title":1159,"path":1160,"stem":1161},"假设你是AI，你的Skill应该是什么样的","/post/zzao/ai-skill-structure","post/zzao/ai-skill-structure",1779005086936]