{"id":480,"date":"2023-11-03T18:32:25","date_gmt":"2023-11-03T10:32:25","guid":{"rendered":"http:\/\/ai.gitpp.com\/?p=480"},"modified":"2023-11-03T18:32:25","modified_gmt":"2023-11-03T10:32:25","slug":"github%e9%a1%b9%e7%9b%ae%e8%bf%9b%e9%98%b6%ef%bc%9a%e7%94%a8scrapy-%e5%92%8c-mongodb%e5%81%9a%e4%b8%80%e6%ac%be%e7%9c%9f%e5%ae%9e%e7%9a%84%e3%80%81%e5%8f%af%e7%94%a8%e7%9a%84%e3%80%81%e5%bc%80","status":"publish","type":"post","link":"http:\/\/ai.gitpp.com\/index.php\/2023\/11\/03\/github%e9%a1%b9%e7%9b%ae%e8%bf%9b%e9%98%b6%ef%bc%9a%e7%94%a8scrapy-%e5%92%8c-mongodb%e5%81%9a%e4%b8%80%e6%ac%be%e7%9c%9f%e5%ae%9e%e7%9a%84%e3%80%81%e5%8f%af%e7%94%a8%e7%9a%84%e3%80%81%e5%bc%80\/","title":{"rendered":"GitHub\u9879\u76ee\u8fdb\u9636\uff1a\u7528Scrapy \u548c MongoDB\u505a\u4e00\u6b3e\u771f\u5b9e\u7684\u3001\u53ef\u7528\u7684\u3001\u5f00\u6e90\u7684\u7f51\u7edc\u722c\u866b"},"content":{"rendered":"\n<p><\/p>\n\n\n\n<p><strong>Scrapy\u4ecb\u7ecd<\/strong><\/p>\n\n\n\n<p>Scrapy\u662f\u4e00\u4e2a\u7528\u4e8e\u7f51\u7edc\u722c\u866b\u7684\u5f00\u6e90\u6846\u67b6\uff0c\u7531 Python \u7f16\u5199\u3002<\/p>\n\n\n\n<p>Scrapy \u662f\u4e00\u4e2a\u5f3a\u5927\u4e14\u6613\u4e8e\u4f7f\u7528\u7684\u722c\u866b\u5de5\u5177\uff0c\u53ef\u4ee5\u5e2e\u52a9\u5f00\u53d1\u8005\u4ece\u7f51\u7ad9\u4e0a\u6293\u53d6\u548c\u63d0\u53d6\u6570\u636e\u3002\u5b83\u5177\u6709\u8bb8\u591a\u4f18\u70b9\uff0c\u5982\u9ad8\u5ea6\u53ef\u5b9a\u5236\u3001\u6613\u4e8e\u5b66\u4e60\u3001\u7a33\u5b9a\u4e14\u9ad8\u6548\u3002<\/p>\n\n\n\n<p><br><strong>Scrapy \u7684\u6838\u5fc3\u529f\u80fd\u5982\u4e0b\uff1a<\/strong><\/p>\n\n\n\n<p><br>1. \u7b80\u5355\u6613\u7528\u7684 API\uff1aScrapy \u63d0\u4f9b\u4e86\u4e00\u5957\u7b80\u6d01\u7684 API\uff0c\u4f7f\u5f97\u5f00\u53d1\u8005\u53ef\u4ee5\u8f7b\u677e\u5730\u7f16\u5199\u722c\u866b\u7a0b\u5e8f\u3002<br>2. 
\u5f3a\u5927\u7684\u722c\u866b\u5f15\u64ce\uff1aScrapy \u5177\u6709\u5f3a\u5927\u7684\u722c\u866b\u5f15\u64ce\uff0c\u53ef\u4ee5\u81ea\u52a8\u8ddf\u8e2a\u7f51\u9875\u7684 HTML \u6807\u7b7e\u548c\u5c5e\u6027\uff0c\u9002\u5e94\u4e0d\u540c\u7684\u7f51\u7ad9\u7ed3\u6784\u3002<br>3. \u9009\u62e9\u6027\u722c\u53d6\uff1aScrapy \u652f\u6301\u6839\u636e\u9700\u6c42\u722c\u53d6\u7279\u5b9a\u5185\u5bb9\uff0c\u4f8b\u5982\uff1a\u63d0\u53d6\u6587\u7ae0\u6807\u9898\u3001\u6458\u8981\u3001\u56fe\u7247\u7b49\u3002<br>4. \u5f02\u6b65\u8bf7\u6c42\uff1aScrapy \u53ef\u4ee5\u8fdb\u884c\u5f02\u6b65\u8bf7\u6c42\uff0c\u907f\u514d\u62e5\u5835\u76ee\u6807\u670d\u52a1\u5668\uff0c\u63d0\u9ad8\u722c\u53d6\u6548\u7387\u3002<br>5. \u8bbe\u7f6e\u5ef6\u8fdf\uff1a\u53ef\u4ee5\u8bbe\u7f6e\u722c\u53d6\u4efb\u52a1\u7684\u6267\u884c\u65f6\u95f4\uff0c\u907f\u514d\u5bf9\u76ee\u6807\u670d\u52a1\u5668\u9020\u6210\u8fc7\u5927\u538b\u529b\u3002<br>6. \u8bbe\u5b9a\u722c\u53d6\u4e0a\u9650\uff1a\u53ef\u4ee5\u8bbe\u7f6e\u6bcf\u4e2a\u57df\u540d\u6216 URL \u7684\u6700\u5927\u8bf7\u6c42\u6570\uff0c\u4ee5\u9075\u5b88\u7f51\u7ad9\u7684\u722c\u866b\u653f\u7b56\u3002<br>7. \u4ee3\u7406\u652f\u6301\uff1a\u652f\u6301\u4f7f\u7528\u4ee3\u7406 IP \u8fdb\u884c\u722c\u53d6\uff0c\u63d0\u9ad8\u9690\u853d\u6027\u548c\u5b89\u5168\u6027\u3002<br>8. \u5206\u5e03\u5f0f\u8c03\u5ea6\uff1aScrapy \u652f\u6301\u5206\u5e03\u5f0f\u8c03\u5ea6\uff0c\u53ef\u4ee5\u5c06\u722c\u53d6\u4efb\u52a1\u5206\u53d1\u5230\u591a\u53f0\u670d\u52a1\u5668\u4e0a\uff0c\u63d0\u9ad8\u722c\u53d6\u901f\u5ea6\u3002<br>9. \u5929\u7136\u53cd\u53cd\u722c\u866b\uff1aScrapy \u9075\u5faa 200 \u884c\u89c4\u5219\uff0c\u5728\u4e00\u5b9a\u7a0b\u5ea6\u4e0a\u53ef\u4ee5\u7ed5\u8fc7\u53cd\u722c\u866b\u673a\u5236\u3002<br>10. 
\u5b58\u50a8\u548c\u89e3\u6790\u6570\u636e\uff1aScrapy \u53ef\u4ee5\u5c06\u722c\u53d6\u5230\u7684\u6570\u636e\u5b58\u50a8\u5230\u672c\u5730\uff0c\u5e76\u652f\u6301\u591a\u79cd\u6570\u636e\u683c\u5f0f\uff0c\u5982 CSV\u3001JSON\u3001XML \u7b49\u3002\u6b64\u5916\uff0cScrapy \u8fd8\u63d0\u4f9b\u4e86\u6570\u636e\u89e3\u6790\u529f\u80fd\uff0c\u65b9\u4fbf\u5f00\u53d1\u8005\u63d0\u53d6\u6240\u9700\u4fe1\u606f\u3002<br>11. \u6269\u5c55\u6027\u5f3a\uff1aScrapy \u6709\u4e30\u5bcc\u7684\u6269\u5c55\u5e93\u548c\u4e2d\u95f4\u4ef6\uff0c\u53ef\u4ee5\u6839\u636e\u5b9e\u9645\u9700\u6c42\u8fdb\u884c\u5b9a\u5236\u3002<\/p>\n\n\n\n<p><strong>\u603b\u4e4b\uff0cScrapy \u662f\u4e00\u4e2a\u529f\u80fd\u4e30\u5bcc\u3001\u6613\u4e8e\u4f7f\u7528\u7684\u7f51\u7edc\u722c\u866b\u6846\u67b6\uff0c\u5e7f\u6cdb\u5e94\u7528\u4e8e\u6570\u636e\u6316\u6398\u3001\u7f51\u7edc\u76d1\u6d4b\u3001\u7ade\u4e89\u5206\u6790\u7b49\u9886\u57df\u3002<\/strong><\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" decoding=\"async\" width=\"1024\" height=\"576\" src=\"http:\/\/ai.gitpp.com\/wp-content\/uploads\/2023\/11\/c63ef599328e4945b14ec22fc20002a2_1-1024x576.png\" alt=\"\" class=\"wp-image-481\" srcset=\"http:\/\/ai.gitpp.com\/wp-content\/uploads\/2023\/11\/c63ef599328e4945b14ec22fc20002a2_1-1024x576.png 1024w, http:\/\/ai.gitpp.com\/wp-content\/uploads\/2023\/11\/c63ef599328e4945b14ec22fc20002a2_1-300x169.png 300w, http:\/\/ai.gitpp.com\/wp-content\/uploads\/2023\/11\/c63ef599328e4945b14ec22fc20002a2_1-768x432.png 768w, http:\/\/ai.gitpp.com\/wp-content\/uploads\/2023\/11\/c63ef599328e4945b14ec22fc20002a2_1.png 1280w\" sizes=\"auto, (max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<p><strong>\u5728\u672c\u6587\u4e2d\uff0c\u6211\u4eec\u5c06\u4e3a<em>\u5927\u5bb6<\/em>\u6784\u5efa\u4e00\u4e2a\u6293\u53d6\u5de5\u5177\uff0c\u5e0c\u671b\u7528 Python \u7a0b\u5e8f\u4ece&nbsp;<\/strong><em><strong>\u76ee\u6807\u7f51\u7ad9<\/strong><\/em><strong>&nbsp; 
\u6293\u53d6\u6570\u636e\u4ee5\u6293\u53d6\u65b0\u95ee\u9898\uff08\u95ee\u9898\u6807\u9898\u548c URL\uff09\u3002\u7136\u540e\uff0c\u6293\u53d6\u7684\u6570\u636e\u5e94\u5b58\u50a8\u5728MongoDB\u4e2d\u3002<\/strong>\u503c\u5f97\u6ce8\u610f\u7684\u662f\uff0cStack Overflow \u6709\u4e00\u4e2a&nbsp;API\uff0c\u53ef\u7528\u4e8e\u8bbf\u95ee<em>\u5b8c\u5168\u76f8\u540c<\/em>\u7684\u6570\u636e\u3002<\/p>\n\n\n\n<p>\u4f46\u662f\uff0c\u4eca\u5929\u6211\u4eec\u8981\u81ea\u5df1\u5b9e\u73b0\u4e00\u4e2a \u6293\u53d6\u5de5\u5177\uff0c\u6240\u4ee5 \u5f00\u59cb\u5427\uff1a<\/p>\n\n\n\n<p><strong>\u7b2c\u4e00\uff1a\u6293\u53d6\u5bf9\u8c61\u7f51\u7ad9\u7684\u60c5\u51b5<\/strong><\/p>\n\n\n\n<p>\u5728\u5f00\u59cb\u4efb\u4f55\u6293\u53d6\u5de5\u4f5c\u4e4b\u524d\uff0c\u8bf7\u52a1\u5fc5\u67e5\u770b\u7f51\u7ad9\u7684\u4f7f\u7528\/\u670d\u52a1\u6761\u6b3e\u5e76\u5c0a\u91cd<em>&nbsp;robots.txt<\/em>\u6587\u4ef6\u3002\u786e\u4fdd\u9075\u5b88\u5408\u4e4e\u9053\u5fb7\u7684\u6293\u53d6\u505a\u6cd5\uff0c\u4e0d\u8981\u5728\u77ed\u65f6\u95f4\u5185\u7528\u5927\u91cf\u8bf7\u6c42\u6df9\u6ca1\u7f51\u7ad9\u3002\u5c06\u60a8\u6293\u53d6\u7684\u4efb\u4f55\u7f51\u7ad9\u89c6\u4e3a\u60a8\u81ea\u5df1\u7684\u7f51\u7ad9\u3002\u4e0d\u80fd\u901a\u8fc7\u9ad8\u9891\u6293\u53d6\uff0c\u628a\u4eba\u5bb6\u7f51\u7ad9\u7ed9\u6293\u5d29\u6e83\u4e86\u3002<\/p>\n\n\n\n<p><strong>\u7b2c\u4e8c\uff1a\u5b89\u88c5<\/strong>\u73af\u5883<\/p>\n\n\n\n<p>\u6211\u4eec\u9700\u8981&nbsp;Scrapy&nbsp;\u5e93 \uff08v1.0.3\uff09 \u548c&nbsp;PyMongo&nbsp;\uff08v3.0.3\uff09 \u6765\u5c06\u6570\u636e\u5b58\u50a8\u5728&nbsp;MongoDB&nbsp;\u4e2d\u3002\u60a8\u8fd8\u9700\u8981\u5b89\u88c5&nbsp;MongoDB<\/p>\n\n\n\n<p>\u5982\u679c\u60a8\u8fd0\u884c\u7684\u662f OSX \u6216 Linux \u7248\u672c\uff0c\u8bf7\u4f7f\u7528 pip \u5b89\u88c5 Scrapy\uff1a<\/p>\n\n\n\n<p><em>&nbsp;&nbsp; shell<\/em><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>$ pip install Scrapy==1.0.3<br>$ pip freeze &gt; 
requirements.txt<\/code><\/pre>\n\n\n\n<p>\u5982\u679c\u60a8\u4f7f\u7528\u7684\u662f Windows \u8ba1\u7b97\u673a\uff0c\u5219\u9700\u8981\u624b\u52a8\u5b89\u88c5\u591a\u4e2a\u4f9d\u8d56\u9879\u3002\u6709\u5173\u8be6\u7ec6\u8bf4\u660e\uff0c\u8bf7\u53c2\u9605\u5b98\u65b9\u6587\u6863\u4ee5\u53ca\u6211\u521b\u5efa\u7684\u8fd9\u4e2a Youtube \u89c6\u9891\u3002<\/p>\n\n\n\n<p>\u8bbe\u7f6e Scrapy \u540e\uff0c\u901a\u8fc7\u5728&nbsp;Python \u4ee3\u7801\u4e2d\u8fd0\u884c\u4ee5\u4e0b\u547d\u4ee4\u6765\u9a8c\u8bc1\u60a8\u7684\u5b89\u88c5<\/p>\n\n\n\n<p>python<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>&gt;&gt;&gt; import scrapy<br>&gt;&gt;&gt;<\/code><\/pre>\n\n\n\n<p>\u5982\u679c\u4f60\u6ca1\u6709\u6536\u5230\u9519\u8bef\uff0c\u90a3\u4e48\u4f60\u5c31\u53ef\u4ee5\u5f00\u59cb\u4e86\uff01<\/p>\n\n\n\n<p>\u5b89\u88c5 PyMongo with pip:<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>$ pip install pymongo<br>$ pip freeze &gt; requirements.txt<\/code><\/pre>\n\n\n\n<p><strong>\u73b0\u5728\u5c31\u53ef\u4ee5\u6784\u5efa\u9879\u76ee<\/strong><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>scrapy \u53ef\u4ee5\u63d0\u4f9b\u547d\u4ee4\u53bb\u6784\u5efa\u4e00\u4e2a\u9879\u76ee<\/code><\/pre>\n\n\n\n<p>\u65b0\u5efa Scrapy project:<\/p>\n\n\n\n<p><code>$ scrapy startproject&nbsp;<\/code><code>stack<\/code><br><code>2015-09-05 20:56:40 [scrapy] INFO: Scrapy 1.0.3 started (bot: scrapybot)<br>2015-09-05 20:56:40 [scrapy] INFO: Optional features available: ssl, http11<br>2015-09-05 20:56:40 [scrapy] INFO: Overridden settings: {}<br>New Scrapy project 'stack' created in:<br>\/stack-spider\/<\/code><code>stack<\/code><code><br><br>You can start your first spider with:<br>cd stack<br>scrapy genspider example example.com<\/code><\/p>\n\n\n\n<p><strong>\u521b\u5efa\u4e86\u4e00\u4e2a\u76ee\u5f55:<\/strong><code>\u251c\u2500\u2500 scrapy.cfg<br>\u2514\u2500\u2500<\/code><code> stack<\/code><code><br>\u251c\u2500\u2500 __init__.py<br>\u251c\u2500\u2500 items.py<br>\u251c\u2500\u2500 
pipelines.py<br>\u251c\u2500\u2500 settings.py<br>\u2514\u2500\u2500 spiders<br>\u2514\u2500\u2500 __init__.py<\/code><\/p>\n\n\n\n<h3 class=\"wp-block-heading\"><\/h3>\n\n\n\n<p>items.py&nbsp;\u6587\u4ef6\u7528\u4e8e\u4e3a\u6211\u4eec\u8ba1\u5212\u6293\u53d6\u7684\u6570\u636e\u5b9a\u4e49\u5b58\u50a8\u201c\u5bb9\u5668\u201d\u3002<\/p>\n\n\n\n<p>\u8be5\u7c7b\u7ee7\u627f\u81ea \uff08docs\uff09\uff0c\u5b83\u57fa\u672c\u4e0a\u6709\u8bb8\u591a Scrapy \u5df2\u7ecf\u4e3a\u6211\u4eec\u6784\u5efa\u7684\u9884\u5b9a\u4e49\u5bf9\u8c61\uff1a<code>StackItem()<\/code><code>Item<\/code><\/p>\n\n\n\n<p><em><code>python<\/code><\/em><code><br><\/code><\/p>\n\n\n\n<p><code>import scrapy<\/code><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code><br>class StackItem(scrapy.Item):<br>    # define the fields for your item here like:<br>    # name = scrapy.Field()<br>    pass<\/code><\/pre>\n\n\n\n<p>\u8ba9\u6211\u4eec\u6dfb\u52a0\u4e00\u4e9b\u6211\u4eec\u771f\u6b63\u60f3\u8981\u6536\u96c6\u7684\u7269\u54c1\u3002\u5bf9\u4e8e\u6bcf\u4e2a\u95ee\u9898\uff0c\u5ba2\u6237\u90fd\u9700\u8981\u6807\u9898\u548c URL\u3002\u56e0\u6b64\uff0c\u8bf7\u50cf\u8fd9\u6837\u66f4\u65b0&nbsp;items.py\uff1a<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>from scrapy.item import Item, Field<br>class StackItem(Item):<br>    title = Field()<br>    url = Field()<\/code><\/pre>\n\n\n\n<p><strong>\u73b0\u5728\u5c31\u53ef\u4ee5\u6784\u5efa\u722c\u866b<\/strong><\/p>\n\n\n\n<p>\u5728\u201cspiders\u201d\u76ee\u5f55\u4e2d\u521b\u5efa\u4e00\u4e2a\u540d\u4e3a&nbsp;<em>stack_spider.py<\/em>&nbsp;\u7684\u6587\u4ef6\u3002<\/p>\n\n\n\n<p>\u9996\u5148\u5b9a\u4e49\u4e00\u4e2a\u7ee7\u627f\u81ea Scrapy \u7684\u7c7b\uff0c\u7136\u540e\u6839\u636e\u9700\u8981\u6dfb\u52a0\u5c5e\u6027\uff1a<code>Spider<\/code><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>from scrapy import Spider<br><br><br>class StackSpider(Spider):<br>    name = \"stack\"<br>    allowed_domains = &#91;\"stackoverflow.com\"]<br>    start_urls = &#91;<br>        
\"http:\/\/stackoverflow.com\/questions?pagesize=50&amp;sort=newest\",<br>    ]<\/code><\/pre>\n\n\n\n<p>\u524d\u51e0\u4e2a\u53d8\u91cf\u662f\u4e0d\u8a00\u81ea\u660e\u7684\uff08\u6587\u6863\uff09\uff1a<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><code>name<\/code>\u5b9a\u4e49 Spider \u7684\u540d\u79f0\u3002<\/li>\n\n\n\n<li><code>allowed_domains<\/code>\u5305\u542b\u5141\u8bb8\u722c\u866b\u722c\u7f51\u7684\u57df\u7684 base-URL\u3002<\/li>\n\n\n\n<li><code>start_urls<\/code>\u662f\u722c\u866b\u8981\u4ece\u4e2d\u5f00\u59cb\u6293\u53d6\u7684 URL \u5217\u8868\u3002\u6240\u6709\u540e\u7eed URL \u90fd\u5c06\u4ece\u722c\u866b\u4ece \u4e2d\u7684 URL \u4e0b\u8f7d\u7684\u6570\u636e\u5f00\u59cb\u3002<code>start_urls<\/code><\/li>\n<\/ul>\n\n\n\n<h3 class=\"wp-block-heading\">XPath&nbsp;\u9009\u62e9\u5668<\/h3>\n\n\n\n<p>\u63a5\u4e0b\u6765\uff0c<strong>Scrapy<\/strong>&nbsp;\u4f7f\u7528<strong>&nbsp;XPath&nbsp;<\/strong>\u9009\u62e9\u5668\u4ece\u7f51\u7ad9\u4e2d\u63d0\u53d6\u6570\u636e\u3002\u6362\u53e5\u8bdd\u8bf4\uff0c\u6211\u4eec\u53ef\u4ee5\u6839\u636e\u7ed9\u5b9a\u7684 XPath \u9009\u62e9 HTML \u6570\u636e\u7684\u67d0\u4e9b\u90e8\u5206\u3002\u6b63\u5982 Scrapy \u7684\u6587\u6863\u4e2d\u6240\u8ff0\uff0c\u201cXPath \u662f\u4e00\u79cd\u7528\u4e8e\u5728 XML \u6587\u6863\u4e2d\u9009\u62e9\u8282\u70b9\u7684\u8bed\u8a00\uff0c\u5b83\u4e5f\u53ef\u4ee5\u4e0e HTML \u4e00\u8d77\u4f7f\u7528\u3002<\/p>\n\n\n\n<p>\u60a8\u53ef\u4ee5\u4f7f\u7528 Chrome \u7684\u5f00\u53d1\u8005\u5de5\u5177\u8f7b\u677e\u627e\u5230\u7279\u5b9a\u7684 Xpath\u3002\u53ea\u9700\u68c0\u67e5\u7279\u5b9a\u7684 HTML \u5143\u7d20\uff0c\u590d\u5236 XPath\uff0c\u7136\u540e\u6839\u636e\u9700\u8981\u8fdb\u884c\u8c03\u6574\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" decoding=\"async\" width=\"1024\" height=\"596\" src=\"http:\/\/ai.gitpp.com\/wp-content\/uploads\/2023\/11\/scrp1-1024x596.png\" alt=\"\" class=\"wp-image-482\" 
srcset=\"http:\/\/ai.gitpp.com\/wp-content\/uploads\/2023\/11\/scrp1-1024x596.png 1024w, http:\/\/ai.gitpp.com\/wp-content\/uploads\/2023\/11\/scrp1-300x175.png 300w, http:\/\/ai.gitpp.com\/wp-content\/uploads\/2023\/11\/scrp1-768x447.png 768w, http:\/\/ai.gitpp.com\/wp-content\/uploads\/2023\/11\/scrp1.png 1080w\" sizes=\"auto, (max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<p>\u5f00\u53d1\u4eba\u5458\u5de5\u5177\u8fd8\u4f7f\u60a8\u80fd\u591f\u5728 JavaScript \u63a7\u5236\u53f0\u4e2d\u4f7f\u7528 <code>$x<\/code> \u6d4b\u8bd5 XPath \u9009\u62e9\u5668\uff0c\u4f8b\u5982 <code>$x(\"\/\/img\")<\/code>\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" decoding=\"async\" width=\"1024\" height=\"530\" src=\"http:\/\/ai.gitpp.com\/wp-content\/uploads\/2023\/11\/scrp2-1024x530.png\" alt=\"\" class=\"wp-image-483\" srcset=\"http:\/\/ai.gitpp.com\/wp-content\/uploads\/2023\/11\/scrp2-1024x530.png 1024w, http:\/\/ai.gitpp.com\/wp-content\/uploads\/2023\/11\/scrp2-300x155.png 300w, http:\/\/ai.gitpp.com\/wp-content\/uploads\/2023\/11\/scrp2-768x398.png 768w, http:\/\/ai.gitpp.com\/wp-content\/uploads\/2023\/11\/scrp2.png 1080w\" sizes=\"auto, (max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<p>\u540c\u6837\uff0c\u6211\u4eec\u57fa\u672c\u4e0a\u544a\u8bc9 Scrapy \u4ece\u54ea\u91cc\u5f00\u59cb\u6839\u636e\u5b9a\u4e49\u7684 XPath \u67e5\u627e\u4fe1\u606f\u3002\u8ba9\u6211\u4eec\u5bfc\u822a\u5230 Chrome \u4e2d\u7684 Stack Overflow \u7ad9\u70b9\u5e76\u627e\u5230 XPath \u9009\u62e9\u5668\u3002<\/p>\n\n\n\n<p>\u53f3\u952e\u5355\u51fb\u7b2c\u4e00\u4e2a\u95ee\u9898\uff0c\u7136\u540e\u9009\u62e9\u201c\u68c0\u67e5\u5143\u7d20\u201d\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" decoding=\"async\" width=\"914\" height=\"585\" src=\"http:\/\/ai.gitpp.com\/wp-content\/uploads\/2023\/11\/scrp3.png\" alt=\"\" class=\"wp-image-484\" srcset=\"http:\/\/ai.gitpp.com\/wp-content\/uploads\/2023\/11\/scrp3.png 914w, 
http:\/\/ai.gitpp.com\/wp-content\/uploads\/2023\/11\/scrp3-300x192.png 300w, http:\/\/ai.gitpp.com\/wp-content\/uploads\/2023\/11\/scrp3-768x492.png 768w\" sizes=\"auto, (max-width: 914px) 100vw, 914px\" \/><\/figure>\n\n\n\n<p>\u73b0\u5728\u83b7\u53d6 <code>&lt;div class=\"summary\"&gt;<\/code> \u7684 XPath\uff08<code>\/\/*[@id=\"question-summary-27624141\"]\/div[2]<\/code>\uff09\uff0c\u7136\u540e\u5728 JavaScript \u63a7\u5236\u53f0\u4e2d\u5bf9\u5176\u8fdb\u884c\u6d4b\u8bd5\uff1a<\/p>\n\n\n\n<p><\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" decoding=\"async\" width=\"966\" height=\"517\" src=\"http:\/\/ai.gitpp.com\/wp-content\/uploads\/2023\/11\/scrp4.png\" alt=\"\" class=\"wp-image-485\" srcset=\"http:\/\/ai.gitpp.com\/wp-content\/uploads\/2023\/11\/scrp4.png 966w, http:\/\/ai.gitpp.com\/wp-content\/uploads\/2023\/11\/scrp4-300x161.png 300w, http:\/\/ai.gitpp.com\/wp-content\/uploads\/2023\/11\/scrp4-768x411.png 768w\" sizes=\"auto, (max-width: 966px) 100vw, 966px\" \/><\/figure>\n\n\n\n<p>\u6b63\u5982\u4f60\u6240\u77e5\u9053\u7684\uff0c\u5b83\u53ea\u662f\u9009\u62e9\u4e86<em>\u4e00\u4e2a\u95ee\u9898<\/em>\u3002\u56e0\u6b64\uff0c\u6211\u4eec\u9700\u8981\u66f4\u6539 XPath \u6765\u83b7\u53d6<em>\u6240\u6709<\/em>\u95ee\u9898\u3002\u6709\u4ec0\u4e48\u60f3\u6cd5\u5417\uff1f\u8fd9\u5f88\u7b80\u5355\uff1a<code>\/\/div[@class=\"summary\"]\/h3<\/code>\u3002\u8fd9\u662f\u4ec0\u4e48\u610f\u601d\uff1f\u4ece\u672c\u8d28\u4e0a\u8bb2\uff0c\u6b64 XPath \u58f0\u660e\uff1a<em>\u83b7\u53d6\u6240\u6709&nbsp;<code>&lt;h3&gt;<\/code>&nbsp;\u5143\u7d20\uff0c\u8fd9\u4e9b\u5143\u7d20\u662f\u5177\u6709&nbsp;<code>summary<\/code>&nbsp;\u7c7b\u7684&nbsp;<code>&lt;div&gt;<\/code>&nbsp;\u7684\u5b50\u5143\u7d20<\/em>\u3002\u5728 JavaScript \u63a7\u5236\u53f0\u4e2d\u6d4b\u8bd5\u6b64 XPath\u3002<\/p>\n\n\n\n<blockquote class=\"wp-block-quote is-layout-flow wp-block-quote-is-layout-flow\">\n<p>\u8bf7\u6ce8\u610f\uff0c\u6211\u4eec\u6ca1\u6709\u4f7f\u7528 Chrome 
\u5f00\u53d1\u8005\u5de5\u5177\u7684\u5b9e\u9645 XPath \u8f93\u51fa\u3002\u5728\u5927\u591a\u6570\u60c5\u51b5\u4e0b\uff0c\u8f93\u51fa\u53ea\u662f\u4e00\u4e2a\u6709\u7528\u7684\u65c1\u767d\uff0c\u5b83\u901a\u5e38\u4e3a\u60a8\u6307\u660e\u67e5\u627e\u5de5\u4f5c XPath \u7684\u6b63\u786e\u65b9\u5411\u3002<\/p>\n<\/blockquote>\n\n\n\n<p>\u73b0\u5728\u8ba9\u6211\u4eec\u66f4\u65b0<em>stack_spider.py<\/em>\u811a\u672c\uff1a<\/p>\n\n\n\n<p><em>stack_spider.py<\/em>&nbsp;script:<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>from scrapy import Spider<br>from scrapy.selector import Selector<br><br><br>class StackSpider(Spider):<br>    name = \"stack\"<br>    allowed_domains = &#91;\"stackoverflow.com\"]<br>    start_urls = &#91;<br>        \"http:\/\/stackoverflow.com\/questions?pagesize=50&amp;sort=newest\",<br>    ]<br><br>    def parse(self, response):<br>        questions = Selector(response).xpath('\/\/div&#91;@class=\"summary\"]\/h3')<\/code><\/pre>\n\n\n\n<h3 class=\"wp-block-heading\">\u63d0\u53d6\u6570\u636e<\/h3>\n\n\n\n<p>\u6211\u4eec\u4ecd\u7136\u9700\u8981\u89e3\u6790\u548c\u6293\u53d6\u6211\u4eec\u60f3\u8981\u7684\u6570\u636e\uff0c\u50cf\u8fd9\u6837\u66f4\u65b0<em>s<strong>tack_spider.py<\/strong><\/em>\uff1a<\/p>\n\n\n\n<p><code>&lt;div class=\"summary\"&gt;&lt;h3&gt;<\/code><\/p>\n\n\n\n<p><code>like so:<\/code><code>from scrapy import Spider<br>from scrapy.selector import Selector<br><br>from stack.items import StackItem<br><br><br>class StackSpider(Spider):<br>name = \"stack\"<br>allowed_domains = [\"stackoverflow.com\"]<br>start_urls = [<br>\"http:\/\/stackoverflow.com\/questions?pagesize=50&amp;sort=newest\",<br>]<br><br>def parse(self, response):<br>questions = Selector(response).xpath('\/\/div[@class=\"summary\"]\/h3')<br><br>for question in questions:<br>item = StackItem()<br>item['title'] = question.xpath(<br>'a[@class=\"question-hyperlink\"]\/text()').extract()[0]<br>item['url'] = 
question.xpath(<br>'a[@class=\"question-hyperlink\"]\/@href').extract()[0]<br>yield item<br>````<br><br>We are iterating through the `questions` and assigning the `title` and `url` values from the scraped data. Be sure to test out the XPath selectors in the JavaScript Console within Chrome Developer Tools - e.g., `$x('\/\/div[@class=\"summary\"]\/h3\/a[@class=\"question-hyperlink\"]\/text()')` and `$x('\/\/div[@class=\"summary\"]\/h3\/a[@class=\"question-hyperlink\"]\/@href')`.<br><\/code><code><br>## Test<br><\/code><\/p>\n\n\n\n<p><strong><code>\u8dd1\u4e00\u4e0b\u8bd5\u8bd5<\/code><\/strong><\/p>\n\n\n\n<p><code>```console<\/code><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>$ scrapy crawl stack<\/code><\/pre>\n\n\n\n<p><code>\u9664\u4e86 Scrapy \u5806\u6808\u8ddf\u8e2a\u5916\uff0c\u60a8\u8fd8\u5e94\u8be5\u770b\u5230\u8f93\u51fa\u7684 50 \u4e2a\u95ee\u9898\u6807\u9898\u548c URL\u3002\u60a8\u53ef\u4ee5\u4f7f\u7528\u4ee5\u4e0b\u5c0f\u547d\u4ee4\u5c06\u8f93\u51fa\u5448\u73b0\u5230 JSON \u6587\u4ef6\uff1a<\/code><\/p>\n\n\n\n<p><code>command:<\/code><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>$ scrapy crawl stack -o items.json -t json<\/code><\/pre>\n\n\n\n<p>\u6211\u4eec\u73b0\u5728\u5df2\u7ecf\u6839\u636e\u6211\u4eec\u6b63\u5728\u5bfb\u627e\u7684\u6570\u636e\u5b9e\u73b0\u4e86\u6211\u4eec\u7684 Spider\u3002\u73b0\u5728\u6211\u4eec\u9700\u8981\u5c06\u6293\u53d6\u7684\u6570\u636e\u5b58\u50a8\u5728MongoDB\u4e2d\u3002<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">\u5c06\u6570\u636e\u5b58\u50a8\u5728 MongoDB \u4e2d<\/h2>\n\n\n\n<p>\u6bcf\u6b21\u8fd4\u56de\u9879\u76ee\u65f6\uff0c\u6211\u4eec\u90fd\u5e0c\u671b\u9a8c\u8bc1\u6570\u636e\uff0c\u7136\u540e\u5c06\u5176\u6dfb\u52a0\u5230 Mongo 
\u96c6\u5408\u4e2d\u3002<\/p>\n\n\n\n<p>\u7b2c\u4e00\u6b65\u662f\u521b\u5efa\u6211\u4eec\u8ba1\u5212\u7528\u4e8e\u4fdd\u5b58\u6240\u6709\u5df2\u722c\u7f51\u6570\u636e\u7684\u6570\u636e\u5e93\u3002\u6253\u5f00&nbsp;<em>settings.py<\/em>&nbsp;\u5e76\u6307\u5b9a\u7ba1\u9053\u5e76\u6dfb\u52a0\u6570\u636e\u5e93\u8bbe\u7f6e\uff1a<\/p>\n\n\n\n<p>database settings:<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>ITEM_PIPELINES = &#91;'stack.pipelines.MongoDBPipeline', ]<br><br>MONGODB_SERVER = \"localhost\"<br>MONGODB_PORT = 27017<br>MONGODB_DB = \"stackoverflow\"<br>MONGODB_COLLECTION = \"questions\"<\/code><\/pre>\n\n\n\n<h3 class=\"wp-block-heading\">\u7ba1\u9053\u7ba1\u7406<\/h3>\n\n\n\n<p>\u6211\u4eec\u5df2\u7ecf\u8bbe\u7f6e\u4e86\u722c\u866b\u6765\u6293\u53d6\u548c\u89e3\u6790 HTML\uff0c\u5e76\u4e14\u6211\u4eec\u5df2\u7ecf\u8bbe\u7f6e\u4e86\u6570\u636e\u5e93\u8bbe\u7f6e\u3002\u73b0\u5728\uff0c\u6211\u4eec\u5fc5\u987b\u901a\u8fc7&nbsp;<em>pipelines.py<\/em>&nbsp;\u4e2d\u7684\u7ba1\u9053\u5c06\u4e24\u8005\u8fde\u63a5\u5728\u4e00\u8d77\u3002<\/p>\n\n\n\n<p><strong>\u8fde\u63a5\u5230\u6570\u636e\u5e93<\/strong><\/p>\n\n\n\n<p>\u9996\u5148\uff0c\u8ba9\u6211\u4eec\u5b9a\u4e49\u4e00\u4e2a\u5b9e\u9645\u8fde\u63a5\u5230\u6570\u636e\u5e93\u7684\u65b9\u6cd5\uff1a<\/p>\n\n\n\n<p>connect to the database:<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>import pymongo<br><br>from scrapy.conf import settings<br><br>class MongoDBPipeline(object):<br><br>    def __init__(self):<br>        connection = pymongo.MongoClient(<br>            settings&#91;'MONGODB_SERVER'],<br>            settings&#91;'MONGODB_PORT']<br>        )<br>        db = connection&#91;settings&#91;'MONGODB_DB']]<br>        self.collection = db&#91;settings&#91;'MONGODB_COLLECTION']]<\/code><\/pre>\n\n\n\n<p>\u5728\u8fd9\u91cc\uff0c\u6211\u4eec\u521b\u5efa\u4e00\u4e2a\u7c7b\uff0c\u6211\u4eec\u6709\u4e00\u4e2a\u6784\u9020\u51fd\u6570\uff0c\u901a\u8fc7\u5b9a\u4e49 Mongo 
\u8bbe\u7f6e\u7136\u540e\u8fde\u63a5\u5230\u6570\u636e\u5e93\u6765\u521d\u59cb\u5316\u7c7b\u3002<code>MongoDBPipeline()<\/code><\/p>\n\n\n\n<p><strong>\u5904\u7406\u6570\u636e<\/strong><\/p>\n\n\n\n<p>\u63a5\u4e0b\u6765\uff0c\u6211\u4eec\u9700\u8981\u5b9a\u4e49\u4e00\u4e2a\u65b9\u6cd5\u6765\u5904\u7406\u89e3\u6790\u540e\u7684\u6570\u636e\uff1a<\/p>\n\n\n\n<p>process the parsed data:<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>import pymongo<br><br>from scrapy.conf import settings<br>from scrapy.exceptions import DropItem<br>from scrapy import log<br><br>class MongoDBPipeline(object):<br><br>    def __init__(self):<br>        connection = pymongo.MongoClient(<br>            settings&#91;'MONGODB_SERVER'],<br>            settings&#91;'MONGODB_PORT']<br>        )<br>        db = connection&#91;settings&#91;'MONGODB_DB']]<br>        self.collection = db&#91;settings&#91;'MONGODB_COLLECTION']]<br><br>    def process_item(self, item, spider):<br>        valid = True<br>        for data in item:<br>            if not data:<br>                valid = False<br>                raise DropItem(\"Missing {0}!\".format(data))<br>        if valid:<br>            self.collection.insert(dict(item))<br>            log.msg(\"Question added to MongoDB database!\",<br>                    level=log.DEBUG, spider=spider)<br>        return item<\/code><\/pre>\n\n\n\n<p>\u6211\u4eec\u5efa\u7acb\u4e0e\u6570\u636e\u5e93\u7684\u8fde\u63a5\uff0c\u89e3\u538b\u7f29\u6570\u636e\uff0c\u7136\u540e\u5c06\u5176\u4fdd\u5b58\u5230\u6570\u636e\u5e93\u4e2d\u3002\u73b0\u5728\u6211\u4eec\u53ef\u4ee5\u518d\u6b21\u6d4b\u8bd5\u4e86\uff01<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">\u6d4b\u8bd5<\/h2>\n\n\n\n<p>\u518d\u6b21\uff0c\u5728\u201cstack\u201d\u76ee\u5f55\u4e2d\u8fd0\u884c\u4ee5\u4e0b\u547d\u4ee4\uff1a<\/p>\n\n\n\n<p>the \u201cstack\u201d directory:<strong><code>$ scrapy crawl 
stack<\/code><\/strong><\/p>\n\n\n\n<p>\u4e07\u5c81\uff01\u6211\u4eec\u5df2\u6210\u529f\u5c06\u6293\u53d6\u7684\u6570\u636e\u5b58\u50a8\u5230\u6570\u636e\u5e93\u4e2d\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" decoding=\"async\" width=\"1024\" height=\"631\" src=\"http:\/\/ai.gitpp.com\/wp-content\/uploads\/2023\/11\/scrp5-1024x631.png\" alt=\"\" class=\"wp-image-486\" srcset=\"http:\/\/ai.gitpp.com\/wp-content\/uploads\/2023\/11\/scrp5-1024x631.png 1024w, http:\/\/ai.gitpp.com\/wp-content\/uploads\/2023\/11\/scrp5-300x185.png 300w, http:\/\/ai.gitpp.com\/wp-content\/uploads\/2023\/11\/scrp5-768x474.png 768w, http:\/\/ai.gitpp.com\/wp-content\/uploads\/2023\/11\/scrp5.png 1080w\" sizes=\"auto, (max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<h2 class=\"wp-block-heading\">\u7ed3\u8bba<\/h2>\n\n\n\n<p>\u8fd9\u662f\u4e00\u4e2a\u975e\u5e38\u7b80\u5355\u7684\u793a\u4f8b\uff0c\u4f7f\u7528 Scrapy \u722c\u53d6\u548c\u6293\u53d6\u7f51\u9875\u3002\u5b9e\u9645\u7684\u81ea\u7531\u804c\u4e1a\u8005\u9879\u76ee\u8981\u6c42\u811a\u672c\u9075\u5faa\u5206\u9875\u94fe\u63a5\u5e76\u4f7f\u7528 CrawlSpider\uff08docs\uff09\u6293\u53d6\u6bcf\u4e2a\u9875\u9762\uff0c\u8fd9\u975e\u5e38\u5bb9\u6613\u5b9e\u73b0\u3002\u5c1d\u8bd5\u81ea\u5df1\u5b9e\u73b0\u5b83\uff0c\u5e76\u5728\u4e0b\u9762\u53d1\u8868\u8bc4\u8bba\uff0c\u5e76\u9644\u4e0a GitHub \u5b58\u50a8\u5e93\u7684\u94fe\u63a5\uff0c\u4ee5\u4fbf\u5feb\u901f\u67e5\u770b\u4ee3\u7801\u3002<\/p>\n\n\n\n<p>github\u5730\u5740\uff1a<\/p>\n\n\n\n<p>\u6e90\u4ee3\u7801<\/p>\n\n\n\n<figure class=\"wp-block-embed\"><div class=\"wp-block-embed__wrapper\">\nhttps:\/\/github.com\/realpython\/stack-spider\/releases\/tag\/v1\n<\/div><\/figure>\n\n\n\n<p>\u6211\u4eec\u6536\u96c6\u4e8610000+\u5f00\u6e90\u9879\u76ee\uff0c \u70b9\u51fb&nbsp;\u9605\u8bfb\u539f\u6587<\/p>\n","protected":false},"excerpt":{"rendered":"<p>Scrapy\u4ecb\u7ecd 
Scrapy\u662f\u4e00\u4e2a\u7528\u4e8e\u7f51\u7edc\u722c\u866b\u7684\u5f00\u6e90\u6846\u67b6\uff0c\u7531 Python \u7f16\u5199\u3002 Scrapy \u662f\u4e00\u4e2a\u5f3a\u5927 [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[10,21],"tags":[],"class_list":["post-480","post","type-post","status-publish","format-standard","hentry","category-python"],"blocksy_meta":"","_links":{"self":[{"href":"http:\/\/ai.gitpp.com\/index.php\/wp-json\/wp\/v2\/posts\/480","targetHints":{"allow":["GET"]}}],"collection":[{"href":"http:\/\/ai.gitpp.com\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"http:\/\/ai.gitpp.com\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"http:\/\/ai.gitpp.com\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"http:\/\/ai.gitpp.com\/index.php\/wp-json\/wp\/v2\/comments?post=480"}],"version-history":[{"count":1,"href":"http:\/\/ai.gitpp.com\/index.php\/wp-json\/wp\/v2\/posts\/480\/revisions"}],"predecessor-version":[{"id":487,"href":"http:\/\/ai.gitpp.com\/index.php\/wp-json\/wp\/v2\/posts\/480\/revisions\/487"}],"wp:attachment":[{"href":"http:\/\/ai.gitpp.com\/index.php\/wp-json\/wp\/v2\/media?parent=480"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"http:\/\/ai.gitpp.com\/index.php\/wp-json\/wp\/v2\/categories?post=480"},{"taxonomy":"post_tag","embeddable":true,"href":"http:\/\/ai.gitpp.com\/index.php\/wp-json\/wp\/v2\/tags?post=480"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}