Python Code for Text Mining > Keyword Frequency > From Webpages > Web Scraping

Ray Islam, PhD
5 min read · Sep 17, 2023


Source: freepik.com

Here is Python code for the following three scenarios:

1. Word frequency from all the associated pages of a website (recursive crawl)



pip install requests beautifulsoup4


import requests
from bs4 import BeautifulSoup
from collections import Counter
from urllib.parse import urljoin

# Define the URL of the website you want to scrape
base_url = 'https://www.washingtonpost.com/'
start_url = base_url # Starting URL

# Define the specific words you want to count
specific_words = ['hunter', 'brand']

# Function to extract text and word frequency from a URL
def extract_word_frequency(url):
    response = requests.get(url)

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        text = soup.get_text()
        words = text.split()
        words = [word.lower() for word in words]
        word_frequency = Counter(words)
        return word_frequency
    else:
        return Counter()  # Return an empty Counter if the page can't be accessed



# Function to recursively crawl and count words on the website
def crawl_website(url, word_frequencies):
    visited_urls = set()  # Track visited URLs to avoid duplicates

    def recursive_crawl(url):
        if url in visited_urls:
            return
        visited_urls.add(url)

        # Extract word frequency from the current page
        word_frequency = extract_word_frequency(url)

        # Store word frequency for the current page in the dictionary
        word_frequencies[url] = word_frequency

        # Print word frequency for the current page
        print(f'URL: {url}')
        for word in specific_words:
            print(f'The word "{word}" appears {word_frequency[word.lower()]} times on this page.')

        # Find and follow links on the current page
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        for link in soup.find_all('a', href=True):
            absolute_link = urljoin(url, link['href'])
            if base_url in absolute_link:  # Check if the link is within the same website
                recursive_crawl(absolute_link)

    recursive_crawl(url)



# Initialize a dictionary to store word frequencies for each page
word_frequencies = {}

# Start crawling from the initial URL
crawl_website(start_url, word_frequencies)

# Print word frequency totals across all pages
print("\nWord Frequency Totals Across All Pages:")
for url, word_frequency in word_frequencies.items():
    print(f'URL: {url}')
    for word in specific_words:
        print(f'Total "{word}" frequency: {word_frequency[word.lower()]}')


URL: https://www.washingtonpost.com/
The word "hunter" appears 2 times on this page.
The word "brand" appears 2 times on this page.
URL: https://www.washingtonpost.com/accessibility
The word "hunter" appears 0 times on this page.
The word "brand" appears 0 times on this page.
URL: https://www.washingtonpost.com/accessibility#main-content
The word "hunter" appears 0 times on this page.
The word "brand" appears 0 times on this page.

2. Word frequency from a single webpage

pip install requests beautifulsoup4

# import libraries
import requests
from bs4 import BeautifulSoup
from collections import Counter

# Define the URL of the website you want to scrape
url = 'https://www.washingtonpost.com/'

# Define the specific words you want to count
specific_words = ['food', 'industry']

# Send an HTTP GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract text content from the HTML
    text = soup.get_text()

    # Tokenize the text (split it into words)
    words = text.split()

    # Convert all words to lowercase for case-insensitive counting
    words = [word.lower() for word in words]

    # Calculate the frequency of specific words
    word_frequency = Counter(words)

    # Print the frequency of specific words
    for word in specific_words:
        print(f'The word "{word}" appears {word_frequency[word.lower()]} times.')

else:
    print(f'Failed to retrieve the webpage. Status code: {response.status_code}')
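
One caveat about the counting above: text.split() leaves punctuation attached, so tokens like "food," or "food." are not counted as "food". A small sketch of a stricter tokenizer using the standard-library re module; the tokenize helper is an illustrative name, not part of the original notebook.

import re
from collections import Counter

def tokenize(text):
    # Lowercase the text and keep only alphabetic word tokens (apostrophes allowed)
    return re.findall(r"[a-z']+", text.lower())

sample = "Food, food prices, and the food industry."
word_frequency = Counter(tokenize(sample))
print(word_frequency['food'])  # prints 3; with sample.lower().split() only the bare "food" token would count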

3. Word frequency with URLs and keywords uploaded from Excel (Google Colab)

from google.colab import files

# Upload the Excel files containing URLs and keywords
uploaded_urls = files.upload() # Upload the URLs Excel file
uploaded_keywords = files.upload() # Upload the Keywords Excel file

Saving test_websites.xlsx to test_websites.xlsx
Saving test_keywords.xlsx to test_keywords.xlsx

import pandas as pd

# Load the uploaded Excel files into DataFrames
df_urls = pd.read_excel('test_websites.xlsx') # Replace with the actual file name
df_keywords = pd.read_excel('test_keywords.xlsx') # Replace with the actual file name

print(df_urls.head())

URL
0 https://www.washingtonpost.com/
1 https://www.nytimes.com/

print(df_keywords.head())

Keyword
0 hunt
1 operations
2 down
3 food

# Extract URLs and keywords from DataFrames
websites_list = df_urls['URL'].tolist() # Adjust the column name as needed
specific_words = df_keywords['Keyword'].tolist() # Adjust the column name as needed
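
If your spreadsheets use different column headers than URL and Keyword, you can also select the first column positionally. A small variant, continuing from the DataFrames loaded above; the assumption that the values sit in the first column is mine.

# Assumes the URLs and keywords sit in the first column of each sheet, whatever it is named
websites_list = df_urls.iloc[:, 0].dropna().tolist()
specific_words = df_keywords.iloc[:, 0].dropna().astype(str).str.lower().tolist()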


import requests
from bs4 import BeautifulSoup
from collections import Counter
from urllib.parse import urljoin

# Function to extract text and word frequency from a URL
def extract_word_frequency(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        text = soup.get_text()
        words = text.split()
        words = [word.lower() for word in words]
        word_frequency = Counter(words)
        return word_frequency
    else:
        return Counter()  # Return an empty Counter if the page can't be accessed



# Function to extract word frequencies from all associated pages of a website
def extract_word_frequencies_for_website(website_url):
    visited_urls = set()  # Track visited URLs to avoid duplicates

    def recursive_crawl(url, word_frequency_total):
        if url in visited_urls:
            return word_frequency_total
        visited_urls.add(url)

        # Extract word frequency from the current page
        word_frequency = extract_word_frequency(url)
        word_frequency_total += word_frequency

        # Print word frequency for the current page
        print(f'URL: {url}')
        for word in specific_words:
            print(f'The word "{word}" appears {word_frequency[word.lower()]} times on this page.')

        # Find and follow links on the current page
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        for link in soup.find_all('a', href=True):
            absolute_link = urljoin(url, link['href'])
            if website_url in absolute_link:  # Check if the link is within the same website
                word_frequency_total = recursive_crawl(absolute_link, word_frequency_total)

        return word_frequency_total

    word_frequency_total = Counter()  # Initialize word_frequency_total
    word_frequency_total = recursive_crawl(website_url, word_frequency_total)

    # Print word frequency totals across all pages of the website
    print("\nWord Frequency Totals Across All Pages of the Website:")
    for word in specific_words:
        print(f'Total "{word}" frequency: {word_frequency_total[word.lower()]}')

# Extract word frequencies for all websites
for website_url in websites_list:
    print(f"\nExtracting Word Frequencies for Website: {website_url}\n")
    extract_word_frequencies_for_website(website_url)

Extracting Word Frequencies for Website: https://www.washingtonpost.com/

URL: https://www.washingtonpost.com/
The word "hunt" appears 0 times on this page.
The word "operations" appears 0 times on this page.
The word "down" appears 4 times on this page.
The word "food" appears 2 times on this page.
URL: https://www.washingtonpost.com/accessibility
The word "hunt" appears 0 times on this page.
The word "operations" appears 0 times on this page.
The word "down" appears 0 times on this page.
The word "food" appears 0 times on this page.

Written by Ray Islam, PhD

PhD in ML | AI Scientist | Professor | Author | Speaker | Reviewer: ICLR; RESS; JPHM | Member: AAAI | Marquis Who's Who | PhD | MASc | MSc | MBA | BSc. Eng.