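"""Scrape company names, URLs, and CEO/Chairman details from the
CyberParks "Companies at Park" page into a timestamped CSV file."""
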
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime
from urllib.parse import urljoin


def scrape_cyberparks():
    """Scrape company information from CyberParks website"""

    url = "https://cyberparks.in/companies-at-park/"

    print(f"Fetching data from {url}...")

    try:
        # Send GET request with a browser-like User-Agent
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # Parse HTML
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all company entries
        companies = []

        # Look for company information (adjust selectors based on actual HTML structure)
        company_sections = soup.find_all(
            ['div', 'article', 'section'],
            class_=lambda x: x and ('company' in x.lower() or 'list' in x.lower())
        )

        if not company_sections:
            # Try alternative approach - find all text blocks
            company_sections = soup.find_all(['div', 'p', 'li'])
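
        # If the live page can be inspected, a targeted selector is more
        # robust than the broad heuristics above. Hypothetically, if each
        # company sat in <li class="company-item">...</li> (the class name
        # is an assumption; verify it against the real markup), both lookups
        # above could be replaced with:
        #
        #     company_sections = soup.select('li.company-item')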
print(f"Found {len(company_sections)} potential company entries...")
|
|
|
|
for section in company_sections:
|
|
text = section.get_text(strip=True)
|
|
|
|
# Skip empty or very short entries
|
|
if len(text) < 10:
|
|
continue

            # Extract the first link, resolving relative hrefs against the page URL
            links = section.find_all('a', href=True)
            company_url = urljoin(url, links[0]['href']) if links else ''

            # Basic extraction logic (you may need to adjust based on actual structure)
            company_data = {
                'Company Name': '',
                'URL': company_url,
                'CEO/Chairman': ''
            }

            # Try to extract company name (usually in bold, heading, or first line)
            name_tag = section.find(['strong', 'b', 'h1', 'h2', 'h3', 'h4'])
            if name_tag:
                company_data['Company Name'] = name_tag.get_text(strip=True)
            else:
                # Take first line as company name
                lines = text.split('\n')
                company_data['Company Name'] = lines[0] if lines else text[:50]

            # Look for CEO/Chairman keywords
            keywords = ('ceo', 'chairman', 'director', 'founder')
            if any(keyword in text.lower() for keyword in keywords):
                for line in text.split('\n'):
                    if any(keyword in line.lower() for keyword in keywords):
                        company_data['CEO/Chairman'] = line.strip()
                        break

            if company_data['Company Name']:
                companies.append(company_data)

        if not companies:
            print("\n✗ No companies found. The website structure might have changed.")
            print("Please check the website manually and adjust the scraping logic.")
            return None

        # Create CSV file (only when there is something to write)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"cyberparks_companies_{timestamp}.csv"

        with open(filename, 'w', newline='', encoding='utf-8') as f:
            fieldnames = ['Company Name', 'URL', 'CEO/Chairman']
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(companies)

        print(f"\n✓ Successfully scraped {len(companies)} companies!")
        print(f"✓ Data saved to: {filename}")

        return filename

    except requests.exceptions.RequestException as e:
        print(f"\n✗ Error fetching the website: {e}")
        return None
    except Exception as e:
        print(f"\n✗ Error during scraping: {e}")
        return None


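# The scraper can also be imported and reused; it returns the CSV file path
# on success, or None on failure. Assuming this file is saved as
# cyberparks_scraper.py (hypothetical module name):
#
#     from cyberparks_scraper import scrape_cyberparks
#     csv_path = scrape_cyberparks()
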
if __name__ == "__main__":
    print("=" * 60)
    print("CyberParks Company Information Scraper")
    print("=" * 60)

    scrape_cyberparks()

    print("\nScraping completed!")
|