# cyberparks_details/scraper.py
"""Scrape company listings from the CyberParks website into a timestamped CSV."""
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime
import time
def scrape_cyberparks():
    """Scrape company information from the CyberParks website.

    Fetches the companies page, extracts each entry's company name, first
    link URL, and any line mentioning a leadership keyword, then writes the
    results to a timestamped CSV file in the current directory.

    Returns:
        The CSV filename (str) on success — even when zero companies were
        extracted, matching the original behavior — or None when the HTTP
        request fails or an unexpected error occurs during scraping.
    """
    url = "https://cyberparks.in/companies-at-park/"
    print(f"Fetching data from {url}...")
    try:
        # Present a browser-like User-Agent; some sites reject default clients.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        companies = []
        # Prefer containers whose CSS class hints at a company listing.
        company_sections = soup.find_all(
            ['div', 'article', 'section'],
            class_=lambda x: x and ('company' in x.lower() or 'list' in x.lower())
        )
        if not company_sections:
            # Fallback: scan generic text blocks when no obvious containers exist.
            company_sections = soup.find_all(['div', 'p', 'li'])

        print(f"Found {len(company_sections)} potential company entries...")

        leadership_keywords = ('ceo', 'chairman', 'director', 'founder')
        for section in company_sections:
            text = section.get_text(strip=True)
            # Skip empty or very short entries.
            if len(text) < 10:
                continue

            # First hyperlink in the section, if any, is taken as the company URL.
            links = section.find_all('a', href=True)
            company_url = links[0]['href'] if links else ''

            company_data = {
                'Company Name': '',
                'URL': company_url,
                'CEO/Chairman': ''
            }

            # Company name is usually in bold or a heading; otherwise fall
            # back to the first line of the section's text.
            name_tag = section.find(['strong', 'b', 'h1', 'h2', 'h3', 'h4'])
            if name_tag:
                company_data['Company Name'] = name_tag.get_text(strip=True)
            else:
                lines = text.split('\n')
                company_data['Company Name'] = lines[0] if lines else text[:50]

            # Record the first line that mentions a leadership keyword, if any.
            # (The per-line check alone suffices; a whole-text pre-check would
            # be redundant since every line is part of the text.)
            for line in text.split('\n'):
                if any(keyword in line.lower() for keyword in leadership_keywords):
                    company_data['CEO/Chairman'] = line.strip()
                    break

            if company_data['Company Name']:
                companies.append(company_data)

        # Timestamped filename avoids clobbering earlier runs.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"cyberparks_companies_{timestamp}.csv"
        with open(filename, 'w', newline='', encoding='utf-8') as f:
            if companies:
                fieldnames = ['Company Name', 'URL', 'CEO/Chairman']
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerows(companies)
                print(f"\n✓ Successfully scraped {len(companies)} companies!")
                # BUG FIX: original printed a literal placeholder instead of
                # the actual output path.
                print(f"✓ Data saved to: {filename}")
            else:
                print("\n✗ No companies found. The website structure might have changed.")
                print("Please check the website manually and adjust the scraping logic.")
        return filename
    except requests.exceptions.RequestException as e:
        print(f"\n✗ Error fetching the website: {e}")
        return None
    except Exception as e:
        # Broad catch kept deliberately: this is a best-effort script boundary;
        # the error is reported and None returned rather than crashing.
        print(f"\n✗ Error during scraping: {e}")
        return None
if __name__ == "__main__":
    # Script entry point: print a banner, run the scraper, and confirm completion.
    banner = "=" * 60
    print(banner)
    print("CyberParks Company Information Scraper")
    print(banner)
    scrape_cyberparks()
    print("\nScraping completed!")