Python Code for Text Mining > Keyword Frequency > From Webpages > Web Scraping
5 min read · Sep 17, 2023
Here is Python code for the following three scenarios:
1. Text_Mining_Word_Frequency_from_all_the_associated_pages
pip install requests beautifulsoup4
import requests
from bs4 import BeautifulSoup
from collections import Counter
from urllib.parse import urljoin
# Define the URL of the website you want to scrape
base_url = 'https://www.washingtonpost.com/'
start_url = base_url # Starting URL
# Define the specific words you want to count
specific_words = ['hunter', 'brand']
# Function to extract text and word frequency from a URL
def extract_word_frequency(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        text = soup.get_text()
        words = text.split()
        words = [word.lower() for word in words]
        word_frequency = Counter(words)
        return word_frequency
    else:
        return Counter()  # Return an empty Counter if the page can't be accessed
# Function to recursively crawl and count words on the website
def crawl_website(url, word_frequencies):
    visited_urls = set()  # Track visited URLs to avoid duplicates

    def recursive_crawl(url):
        if url in visited_urls:
            return
        visited_urls.add(url)
        # Extract word frequency from the current page
        word_frequency = extract_word_frequency(url)
        # Store word frequency for the current page in the dictionary
        word_frequencies[url] = word_frequency
        # Print word frequency for the current page
        print(f'URL: {url}')
        for word in specific_words:
            print(f'The word "{word}" appears {word_frequency[word.lower()]} times on this page.')
        # Find and follow links on the current page
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        for link in soup.find_all('a', href=True):
            absolute_link = urljoin(url, link['href'])
            if base_url in absolute_link:  # Check if the link is within the same website
                recursive_crawl(absolute_link)

    recursive_crawl(url)
# Initialize a dictionary to store word frequencies for each page
word_frequencies = {}
# Start crawling from the initial URL
crawl_website(start_url, word_frequencies)
# Print word frequency totals summed across all crawled pages
print("\nWord Frequency Totals Across All Pages:")
total_frequency = Counter()
for word_frequency in word_frequencies.values():
    total_frequency += word_frequency
for word in specific_words:
    print(f'Total "{word}" frequency: {total_frequency[word.lower()]}')
URL: https://www.washingtonpost.com/
The word "hunter" appears 2 times on this page.
The word "brand" appears 2 times on this page.
URL: https://www.washingtonpost.com/accessibility
The word "hunter" appears 0 times on this page.
The word "brand" appears 0 times on this page.
URL: https://www.washingtonpost.com/accessibility#main-content
The word "hunter" appears 0 times on this page.
The word "brand" appears 0 times on this page.
2. Text_Mining_Word_Frequency_from_a_single_page
pip install requests beautifulsoup4
# import libraries
import requests
from bs4 import BeautifulSoup
from collections import Counter
# Define the URL of the website you want to scrape
url = 'https://www.washingtonpost.com/'
# Define the specific words you want to count
specific_words = ['food', 'industry']
# Send an HTTP GET request to the URL
response = requests.get(url)
# Check if the request was successful (status code 200)
# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')
    # Extract text content from the HTML
    text = soup.get_text()
    # Tokenize the text (split it into words)
    words = text.split()
    # Convert all words to lowercase for case-insensitive counting
    words = [word.lower() for word in words]
    # Calculate the frequency of specific words
    word_frequency = Counter(words)
    # Print the frequency of specific words
    for word in specific_words:
        print(f'The word "{word}" appears {word_frequency[word.lower()]} times.')
else:
    print(f'Failed to retrieve the webpage. Status code: {response.status_code}')
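One caveat with text.split(): punctuation stays attached to tokens, so "food," and "food" are counted as different words. A small sketch of a regex-based tokenizer (the tokenize helper is an illustrative addition, not part of the original code) that usually gives cleaner counts:

import re
from collections import Counter

def tokenize(text):
    # Lowercase the text and keep only alphanumeric word tokens
    return re.findall(r'[a-z0-9]+', text.lower())

word_frequency = Counter(tokenize('Food, glorious food!'))
print(word_frequency['food'])  # 2, because 'Food,' and 'food!' both normalize to 'food'

Swapping this in for text.split() in the scripts above only changes the tokenization step; the Counter logic stays the same.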
3. Text_Mining_Word_Frequency_data_uploaded_from_Excel
from google.colab import files
# Upload the Excel files containing URLs and keywords
uploaded_urls = files.upload() # Upload the URLs Excel file
uploaded_keywords = files.upload() # Upload the Keywords Excel file
Saving test_websites.xlsx to test_websites.xlsx
Saving test_keywords.xlsx to test_keywords.xlsx
import pandas as pd
# Load the uploaded Excel files into DataFrames
df_urls = pd.read_excel('test_websites.xlsx') # Replace with the actual file name
df_keywords = pd.read_excel('test_keywords.xlsx') # Replace with the actual file name
print(df_urls.head())
URL
0 https://www.washingtonpost.com/
1 https://www.nytimes.com/
print(df_keywords.head())
Keyword
0 hunt
1 operations
2 down
3 food
# Extract URLs and keywords from DataFrames
websites_list = df_urls['URL'].tolist() # Adjust the column name as needed
specific_words = df_keywords['Keyword'].tolist() # Adjust the column name as needed
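Keywords read from Excel can carry stray whitespace or mixed case and then fail to match the lowercased page tokens. A short normalization sketch, assuming the same 'Keyword' column as above:

# Strip whitespace, lowercase, and de-duplicate the keywords before matching
specific_words = (
    df_keywords['Keyword']
    .dropna()
    .astype(str)
    .str.strip()
    .str.lower()
    .unique()
    .tolist()
)
print(specific_words)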
import requests
from bs4 import BeautifulSoup
from collections import Counter
from urllib.parse import urljoin
# Function to extract text and word frequency from a URL
def extract_word_frequency(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        text = soup.get_text()
        words = text.split()
        words = [word.lower() for word in words]
        word_frequency = Counter(words)
        return word_frequency
    else:
        return Counter()  # Return an empty Counter if the page can't be accessed
# Function to extract word frequencies from all associated pages of a website
def extract_word_frequencies_for_website(website_url):
    visited_urls = set()  # Track visited URLs to avoid duplicates

    def recursive_crawl(url, word_frequency_total):
        if url in visited_urls:
            return word_frequency_total
        visited_urls.add(url)
        # Extract word frequency from the current page
        word_frequency = extract_word_frequency(url)
        word_frequency_total += word_frequency
        # Print word frequency for the current page
        print(f'URL: {url}')
        for word in specific_words:
            print(f'The word "{word}" appears {word_frequency[word.lower()]} times on this page.')
        # Find and follow links on the current page
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        for link in soup.find_all('a', href=True):
            absolute_link = urljoin(url, link['href'])
            if website_url in absolute_link:  # Check if the link is within the same website
                word_frequency_total = recursive_crawl(absolute_link, word_frequency_total)
        return word_frequency_total

    word_frequency_total = Counter()  # Initialize word_frequency_total
    word_frequency_total = recursive_crawl(website_url, word_frequency_total)

    # Print word frequency totals across all pages of the website
    print("\nWord Frequency Totals Across All Pages of the Website:")
    for word in specific_words:
        print(f'Total "{word}" frequency: {word_frequency_total[word.lower()]}')
# Extract word frequencies for all websites
for website_url in websites_list:
    print(f"\nExtracting Word Frequencies for Website: {website_url}\n")
    extract_word_frequencies_for_website(website_url)
Extracting Word Frequencies for Website: https://www.washingtonpost.com/
URL: https://www.washingtonpost.com/
The word "hunt" appears 0 times on this page.
The word "operations" appears 0 times on this page.
The word "down" appears 4 times on this page.
The word "food" appears 2 times on this page.
URL: https://www.washingtonpost.com/accessibility
The word "hunt" appears 0 times on this page.
The word "operations" appears 0 times on this page.
The word "down" appears 0 times on this page.
The word "food" appears 0 times on this page.