
Commit beeeb6f

Add files via upload
1 parent 23ad9de commit beeeb6f

18 files changed: +1218 -1 lines

README.md

+711 -1
Large diffs are not rendered by default.

requests-beautifulsoup-scraper.py

+54
@@ -0,0 +1,54 @@
import requests
from bs4 import BeautifulSoup
import csv

# URL of the page to scrape
url = "https://en.wikipedia.org/wiki/Web_scraping"

# Send a GET request to the URL and get the response
response = requests.get(url)

# Get the HTML content of the page
html = response.text

# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(html, "html.parser")

# List where to store the scraped titles
titles = []

# List of header levels (h1, h2, h3, h4, h5)
title_level_list = [1, 2, 3, 4, 5]

# Loop through each header level (h1, h2, h3, h4, h5)
for title_level in title_level_list:
    # Find all elements of the current header level
    title_elements = soup.find_all(f"h{title_level}")

    # Loop through each title element found
    for title_element in title_elements:
        # Data extraction logic
        tag = title_element.name
        text = title_element.text

        # Create a dictionary with the tag and the title text
        title = {
            "tag": tag,
            "title": text,
        }

        # Append the dictionary to the titles list
        titles.append(title)

# Open a CSV file to write the data
with open("titles.csv", mode="w", newline="", encoding="utf-8") as file:
    # Create a CSV writer object and specify the fieldnames (columns)
    writer = csv.DictWriter(file, fieldnames=["tag", "title"])

    # Write the header (column names) to the CSV file
    writer.writeheader()

    # Write each row (dictionary) to the CSV file
    for row in titles:
        writer.writerow(row)
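
The script above fetches the page with a bare GET and parses whatever comes back. A slightly hardened variant of the same idea, sketched below, would fail fast on HTTP errors and collapse the per-level loop into one CSS selector; the User-Agent string, the timeout value, and the selector are illustrative assumptions, not part of this commit:

import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/Web_scraping"

# Identify the client and bound the request time (both values are assumptions)
headers = {"User-Agent": "titles-scraper/0.1 (example)"}
response = requests.get(url, headers=headers, timeout=10)

# Raise on 4xx/5xx responses instead of silently parsing an error page
response.raise_for_status()

soup = BeautifulSoup(response.text, "html.parser")

# One CSS selector covers all header levels at once
titles = [
    {"tag": el.name, "title": el.get_text(strip=True)}
    for el in soup.select("h1, h2, h3, h4, h5")
]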

scrapy_scraping/scrapy.cfg

+11
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = scrapy_scraping.settings

[deploy]
#url = http://localhost:6800/
project = scrapy_scraping

scrapy_scraping/scrapy_scraping/__init__.py

Whitespace-only changes.
Binary file not shown.
Binary file not shown.

scrapy_scraping/scrapy_scraping/items.py

+12
@@ -0,0 +1,12 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class ScrapyScrapingItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
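
The generated item class is left as a stub, and the spider added later in this commit yields plain dicts instead. If typed items were preferred, a minimal sketch (the TitleItem name is hypothetical, not part of the commit) would declare the two fields the spider produces:

import scrapy


class TitleItem(scrapy.Item):
    # Hypothetical item mirroring the keys yielded by the spider
    tag = scrapy.Field()
    title = scrapy.Field()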

scrapy_scraping/scrapy_scraping/middlewares.py

+103
@@ -0,0 +1,103 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class ScrapyScrapingSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class ScrapyScrapingDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
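
Both classes are the unmodified scrapy startproject stubs and are not enabled in settings.py. For comparison, a minimal sketch of a downloader middleware that actually does something, in this case setting a fixed User-Agent (the class name and header value are assumptions, not part of the commit), needs only process_request plus an entry in DOWNLOADER_MIDDLEWARES to take effect:

class FixedUserAgentMiddleware:
    # Hypothetical example: attach a fixed User-Agent header to every
    # outgoing request before the downloader sends it.
    def process_request(self, request, spider):
        request.headers["User-Agent"] = "titles-scraper/0.1 (example)"
        return None  # continue normal processing of the request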

scrapy_scraping/scrapy_scraping/pipelines.py

+13
@@ -0,0 +1,13 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class ScrapyScrapingPipeline:
    def process_item(self, item, spider):
        return item
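
The generated pipeline just passes items through. A minimal sketch of a useful one (the class name is an assumption, not part of the commit) would normalize scraped titles before the feed export writes them; ItemAdapter wraps the plain dicts the spider yields as well as Item objects, so the same code works either way. Enabling it would mean uncommenting ITEM_PIPELINES in settings.py and pointing it at this class.

from itemadapter import ItemAdapter


class TitleCleanupPipeline:
    # Hypothetical pipeline: strip surrounding whitespace from the title
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        title = adapter.get("title")
        if title:
            adapter["title"] = title.strip()
        return item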

scrapy_scraping/scrapy_scraping/settings.py

+92
@@ -0,0 +1,92 @@
# Scrapy settings for scrapy_scraping project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "scrapy_scraping"

SPIDER_MODULES = ["scrapy_scraping.spiders"]
NEWSPIDER_MODULE = "scrapy_scraping.spiders"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "scrapy_scraping (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#    "Accept-Language": "en",
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "scrapy_scraping.middlewares.ScrapyScrapingSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    "scrapy_scraping.middlewares.ScrapyScrapingDownloaderMiddleware": 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    "scrapy_scraping.pipelines.ScrapyScrapingPipeline": 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
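
Beyond the project identifiers, only ROBOTSTXT_OBEY, the asyncio reactor, and the UTF-8 feed encoding are active; everything else stays commented out. A politer configuration, sketched below with values that are assumptions rather than part of this commit, would add a download delay, enable AutoThrottle, and switch on the project pipeline:

# Hypothetical additions to settings.py (values are illustrative)
DOWNLOAD_DELAY = 1
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5
AUTOTHROTTLE_MAX_DELAY = 60

ITEM_PIPELINES = {
    "scrapy_scraping.pipelines.ScrapyScrapingPipeline": 300,
}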

scrapy_scraping/scrapy_scraping/spiders/__init__.py

+4
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
Binary file not shown.
Binary file not shown.

scrapy_scraping/scrapy_scraping/spiders/wikipedia.py

+31
@@ -0,0 +1,31 @@
import scrapy


class WikipediaSpider(scrapy.Spider):
    name = "wikipedia"
    allowed_domains = ["en.wikipedia.org"]
    start_urls = ["https://en.wikipedia.org/wiki/Web_scraping"]

    def parse(self, response):
        # List to store the titles
        titles = []

        # List of header levels (h1, h2, h3, h4, h5)
        title_level_list = [1, 2, 3, 4, 5]

        # Loop through each header level (h1, h2, h3, h4, h5)
        for title_level in title_level_list:
            # Find all elements of the current header level
            title_elements = response.css(f"h{title_level}")

            # Loop through each title element found
            for title_element in title_elements:
                # Extract tag and text
                tag = title_element.root.tag
                text = title_element.css("::text").get().strip()

                # Yield the data directly to the feed
                yield {
                    "tag": tag,
                    "title": text,
                }
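
The spider yields each header straight to the feed exporter, so the titles.csv below comes from the crawl itself, presumably via something like scrapy crawl wikipedia -O titles.csv run next to scrapy.cfg (the exact invocation is not recorded in the commit). The same crawl can also be driven from Python; a minimal sketch, with the FEEDS values as assumptions:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load the project settings (requires running inside the Scrapy project)
settings = get_project_settings()
# Write the yielded dicts to a CSV feed, overwriting any previous run
settings.set("FEEDS", {"titles.csv": {"format": "csv", "overwrite": True}})

process = CrawlerProcess(settings)
process.crawl("wikipedia")  # spider name as declared in WikipediaSpider
process.start()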

scrapy_scraping/titles.csv

+22
@@ -0,0 +1,22 @@
tag,title
h1,Web scraping
h2,Contents
h2,History
h2,Techniques
h2,Legal issues
h2,Methods to prevent web scraping
h2,See also
h2,References
h3,Human copy-and-paste
h3,Text pattern matching
h3,HTTP programming
h3,HTML parsing
h3,DOM parsing
h3,Vertical aggregation
h3,Semantic annotation recognizing
h3,Computer vision web-page analysis
h3,AI-powered document understanding
h3,United States
h3,European Union
h3,Australia
h3,India
