Python Code for Text Mining > Keyword Frequency > From Webpages > Web Scraping

Ray Islam, PhD
5 min read · Sep 17, 2023


Source: freepik.com

Here is Python code for the following three scenarios:

1. Word frequency from all the associated pages of a website (recursive crawl)



pip install requests beautifulsoup4


import requests
from bs4 import BeautifulSoup
from collections import Counter
from urllib.parse import urljoin

# Define the URL of the website you want to scrape
base_url = 'https://www.washingtonpost.com/'
start_url = base_url # Starting URL

# Define the specific words you want to count
specific_words = ['hunter', 'brand']

# Function to extract text and word frequency from a URL
def extract_word_frequency(url):
    response = requests.get(url)

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        text = soup.get_text()
        words = text.split()
        words = [word.lower() for word in words]
        word_frequency = Counter(words)
        return word_frequency
    else:
        return Counter()  # Return an empty Counter if the page can't be accessed



# Function to recursively crawl and count words on the website
def crawl_website(url, word_frequencies):
    visited_urls = set()  # Track visited URLs to avoid duplicates

    def recursive_crawl(url):
        if url in visited_urls:
            return
        visited_urls.add(url)

        # Extract word frequency from the current page
        word_frequency = extract_word_frequency(url)

        # Store word frequency for the current page in the dictionary
        word_frequencies[url] = word_frequency

        # Print word frequency for the current page
        print(f'URL: {url}')
        for word in specific_words:
            print(f'The word "{word}" appears {word_frequency[word.lower()]} times on this page.')

        # Find and follow links on the current page
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        for link in soup.find_all('a', href=True):
            absolute_link = urljoin(url, link['href'])
            if base_url in absolute_link:  # Check if the link is within the same website
                recursive_crawl(absolute_link)

    recursive_crawl(url)



# Initialize a dictionary to store word frequencies for each page
word_frequencies = {}

# Start crawling from the initial URL
crawl_website(start_url, word_frequencies)

# Print word frequency totals across all pages
print("\nWord Frequency Totals Across All Pages:")
for url, word_frequency in word_frequencies.items():
    print(f'URL: {url}')
    for word in specific_words:
        print(f'Total "{word}" frequency: {word_frequency[word.lower()]}')


URL: https://www.washingtonpost.com/
The word "hunter" appears 2 times on this page.
The word "brand" appears 2 times on this page.
URL: https://www.washingtonpost.com/accessibility
The word "hunter" appears 0 times on this page.
The word "brand" appears 0 times on this page.
URL: https://www.washingtonpost.com/accessibility#main-content
The word "hunter" appears 0 times on this page.
The word "brand" appears 0 times on this page.

2. Word frequency from a single webpage

pip install requests beautifulsoup4

# import libraries
import requests
from bs4 import BeautifulSoup
from collections import Counter

# Define the URL of the website you want to scrape
url = 'https://www.washingtonpost.com/'

# Define the specific words you want to count
specific_words = ['food', 'industry']

# Send an HTTP GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract text content from the HTML
    text = soup.get_text()

    # Tokenize the text (split it into words)
    words = text.split()

    # Convert all words to lowercase for case-insensitive counting
    words = [word.lower() for word in words]

    # Calculate the frequency of specific words
    word_frequency = Counter(words)

    # Print the frequency of specific words
    for word in specific_words:
        print(f'The word "{word}" appears {word_frequency[word.lower()]} times.')

else:
    print(f'Failed to retrieve the webpage. Status code: {response.status_code}')
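
One caveat about the counting above: text.split() leaves punctuation attached, so tokens like "food," or "food." are not counted as "food". A small sketch of a stricter tokenizer using the standard-library re module; the tokenize helper is an illustrative name, not part of the original notebook.

import re
from collections import Counter

def tokenize(text):
    # Lowercase the text and keep only alphabetic word tokens (apostrophes allowed)
    return re.findall(r"[a-z']+", text.lower())

sample = "Food, food prices, and the food industry."
word_frequency = Counter(tokenize(sample))
print(word_frequency['food'])  # prints 3; with sample.lower().split() only the bare "food" token would count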

3. Word frequency with URLs and keywords uploaded from Excel (Google Colab)

from google.colab import files

# Upload the Excel files containing URLs and keywords
uploaded_urls = files.upload() # Upload the URLs Excel file
uploaded_keywords = files.upload() # Upload the Keywords Excel file

Saving test_websites.xlsx to test_websites.xlsx
Saving test_keywords.xlsx to test_keywords.xlsx

import pandas as pd

# Load the uploaded Excel files into DataFrames
df_urls = pd.read_excel('test_websites.xlsx') # Replace with the actual file name
df_keywords = pd.read_excel('test_keywords.xlsx') # Replace with the actual file name

print(df_urls.head())

URL
0 https://www.washingtonpost.com/
1 https://www.nytimes.com/

print(df_keywords.head())

Keyword
0 hunt
1 operations
2 down
3 food

# Extract URLs and keywords from DataFrames
websites_list = df_urls['URL'].tolist() # Adjust the column name as needed
specific_words = df_keywords['Keyword'].tolist() # Adjust the column name as needed
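
If your spreadsheets use different column headers than URL and Keyword, you can also select the first column positionally. A small variant, continuing from the DataFrames loaded above; the assumption that the values sit in the first column is mine.

# Assumes the URLs and keywords sit in the first column of each sheet, whatever it is named
websites_list = df_urls.iloc[:, 0].dropna().tolist()
specific_words = df_keywords.iloc[:, 0].dropna().astype(str).str.lower().tolist()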


import requests
from bs4 import BeautifulSoup
from collections import Counter
from urllib.parse import urljoin

# Function to extract text and word frequency from a URL
def extract_word_frequency(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        text = soup.get_text()
        words = text.split()
        words = [word.lower() for word in words]
        word_frequency = Counter(words)
        return word_frequency
    else:
        return Counter()  # Return an empty Counter if the page can't be accessed



# Function to extract word frequencies from all associated pages of a website
def extract_word_frequencies_for_website(website_url):
    visited_urls = set()  # Track visited URLs to avoid duplicates

    def recursive_crawl(url, word_frequency_total):
        if url in visited_urls:
            return word_frequency_total
        visited_urls.add(url)

        # Extract word frequency from the current page
        word_frequency = extract_word_frequency(url)
        word_frequency_total += word_frequency

        # Print word frequency for the current page
        print(f'URL: {url}')
        for word in specific_words:
            print(f'The word "{word}" appears {word_frequency[word.lower()]} times on this page.')

        # Find and follow links on the current page
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        for link in soup.find_all('a', href=True):
            absolute_link = urljoin(url, link['href'])
            if website_url in absolute_link:  # Check if the link is within the same website
                word_frequency_total = recursive_crawl(absolute_link, word_frequency_total)

        return word_frequency_total

    word_frequency_total = Counter()  # Initialize word_frequency_total
    word_frequency_total = recursive_crawl(website_url, word_frequency_total)

    # Print word frequency totals across all pages of the website
    print("\nWord Frequency Totals Across All Pages of the Website:")
    for word in specific_words:
        print(f'Total "{word}" frequency: {word_frequency_total[word.lower()]}')

# Extract word frequencies for all websites
for website_url in websites_list:
    print(f"\nExtracting Word Frequencies for Website: {website_url}\n")
    extract_word_frequencies_for_website(website_url)

Extracting Word Frequencies for Website: https://www.washingtonpost.com/

URL: https://www.washingtonpost.com/
The word "hunt" appears 0 times on this page.
The word "operations" appears 0 times on this page.
The word "down" appears 4 times on this page.
The word "food" appears 2 times on this page.
URL: https://www.washingtonpost.com/accessibility
The word "hunt" appears 0 times on this page.
The word "operations" appears 0 times on this page.
The word "down" appears 0 times on this page.
The word "food" appears 0 times on this page.

Written by Ray Islam, PhD

PhD in ML | AI Scientist | Professor | Author | Speaker | Reviewer: ICLR; RESS; JPHM | Member: AAAI | Marquis Who's Who | PhD | MASc | MSc | MBA | BSc. Eng.