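"""Scrape company names, URLs, and CEO/Chairman details from the
CyberParks "Companies at Park" page into a timestamped CSV file."""
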
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime
from urllib.parse import urljoin


def scrape_cyberparks():
    """Scrape company information from CyberParks website"""

    url = "https://cyberparks.in/companies-at-park/"

    print(f"Fetching data from {url}...")

    try:
        # Send GET request with a browser-like User-Agent
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # Parse HTML
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all company entries
        companies = []

        # Look for company information (adjust selectors based on actual HTML structure)
        company_sections = soup.find_all(
            ['div', 'article', 'section'],
            class_=lambda x: x and ('company' in x.lower() or 'list' in x.lower())
        )

        if not company_sections:
            # Try alternative approach - find all text blocks
            company_sections = soup.find_all(['div', 'p', 'li'])
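
        # If the live page can be inspected, a targeted selector is more
        # robust than the broad heuristics above. Hypothetically, if each
        # company sat in <li class="company-item">...</li> (the class name
        # is an assumption; verify it against the real markup), both lookups
        # above could be replaced with:
        #
        #     company_sections = soup.select('li.company-item')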
print(f"Found {len(company_sections)} potential company entries...")
|
|
|
|
for section in company_sections:
|
|
text = section.get_text(strip=True)
|
|
|
|
# Skip empty or very short entries
|
|
if len(text) < 10:
|
|
continue

            # Extract the first link, resolving relative hrefs against the page URL
            links = section.find_all('a', href=True)
            company_url = urljoin(url, links[0]['href']) if links else ''

            # Basic extraction logic (you may need to adjust based on actual structure)
            company_data = {
                'Company Name': '',
                'URL': company_url,
                'CEO/Chairman': ''
            }

            # Try to extract company name (usually in bold, heading, or first line)
            name_tag = section.find(['strong', 'b', 'h1', 'h2', 'h3', 'h4'])
            if name_tag:
                company_data['Company Name'] = name_tag.get_text(strip=True)
            else:
                # Take first line as company name
                lines = text.split('\n')
                company_data['Company Name'] = lines[0] if lines else text[:50]

            # Look for CEO/Chairman keywords
            keywords = ('ceo', 'chairman', 'director', 'founder')
            if any(keyword in text.lower() for keyword in keywords):
                for line in text.split('\n'):
                    if any(keyword in line.lower() for keyword in keywords):
                        company_data['CEO/Chairman'] = line.strip()
                        break

            if company_data['Company Name']:
                companies.append(company_data)

        if not companies:
            print("\n✗ No companies found. The website structure might have changed.")
            print("Please check the website manually and adjust the scraping logic.")
            return None

        # Create CSV file (only when there is something to write)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"cyberparks_companies_{timestamp}.csv"

        with open(filename, 'w', newline='', encoding='utf-8') as f:
            fieldnames = ['Company Name', 'URL', 'CEO/Chairman']
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(companies)

        print(f"\n✓ Successfully scraped {len(companies)} companies!")
        print(f"✓ Data saved to: {filename}")

        return filename

    except requests.exceptions.RequestException as e:
        print(f"\n✗ Error fetching the website: {e}")
        return None
    except Exception as e:
        print(f"\n✗ Error during scraping: {e}")
        return None


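# The scraper can also be imported and reused; it returns the CSV file path
# on success, or None on failure. Assuming this file is saved as
# cyberparks_scraper.py (hypothetical module name):
#
#     from cyberparks_scraper import scrape_cyberparks
#     csv_path = scrape_cyberparks()
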
if __name__ == "__main__":
    print("=" * 60)
    print("CyberParks Company Information Scraper")
    print("=" * 60)

    scrape_cyberparks()

    print("\nScraping completed!")
|