import json
from datetime import datetime

import pandas as pd
import requests


def estimate_tokens(length):
    # Rough estimation: 1 token ≈ 4 characters. Note that the CDX 'length'
    # field is the compressed WARC record size, so this is only a coarse
    # proxy for page text volume.
    return int(length / 4)


def get_domain_data(domain):
    """Query the Common Crawl CDX index for a domain and estimate tokens per page."""
    url = f"https://index.commoncrawl.org/CC-MAIN-2023-14-index?url=*.{domain}&output=json"
    response = requests.get(url, timeout=60)
    pages = []
    total_tokens = 0
    if response.status_code != 200:
        # The index returns a non-200 status (with an error body) when no
        # captures exist for the domain; treat that as zero pages.
        return pages, total_tokens
    for line in response.iter_lines():
        if not line:
            continue
        page_data = json.loads(line)
        timestamp = page_data['timestamp']
        formatted_date = datetime.strptime(timestamp, "%Y%m%d%H%M%S").strftime("%m/%d/%Y")
        length = int(page_data.get('length', 0))
        tokens = estimate_tokens(length)
        total_tokens += tokens
        pages.append({
            'domain': domain,
            'url': page_data['url'],
            'date': formatted_date,
            'timestamp': timestamp,
            'mime': page_data.get('mime', 'N/A'),
            'status': page_data.get('status', 'N/A'),
            'languages': page_data.get('languages', 'N/A'),
            'tokens': tokens
        })
    return pages, total_tokens


def main():
    print("Welcome to the Common Crawl Domain Checker with Token Estimation!")
    domains_input = input("Please enter a list of domains, separated by commas: ")
    domains = [domain.strip() for domain in domains_input.split(',')]

    all_pages = []
    domain_tokens = {}
    total_tokens = 0
    for domain in domains:
        print(f"Fetching data for {domain}...")
        domain_pages, domain_total_tokens = get_domain_data(domain)
        all_pages.extend(domain_pages)
        domain_tokens[domain] = domain_total_tokens
        total_tokens += domain_total_tokens
        print(f"Found {len(domain_pages)} pages and estimated {domain_total_tokens} tokens for {domain}")

    df = pd.DataFrame(all_pages)
    if df.empty:
        print("No pages found for any of the given domains; nothing to save.")
        return

    # Calculate each domain's share of the total estimated tokens
    for domain in domains:
        percentage = (domain_tokens[domain] / total_tokens) * 100 if total_tokens > 0 else 0
        domain_tokens[domain] = (domain_tokens[domain], percentage)

    # Generate timestamp for filenames
    run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_filename = f"common_crawl_results_{run_timestamp}.csv"

    # Save per-page results to CSV
    df.to_csv(csv_filename, index=False)
    print(f"\nDetailed results saved to {csv_filename}")

    # Create and save summary DataFrame
    summary_data = []
    for domain, (tokens, percentage) in domain_tokens.items():
        domain_df = df[df['domain'] == domain]
        page_count = domain_df['url'].count()
        # Compare the raw YYYYMMDDHHMMSS timestamps, which sort chronologically;
        # min/max on the MM/DD/YYYY display strings would sort lexicographically
        # and give wrong results across years.
        if page_count > 0:
            earliest_crawl = datetime.strptime(domain_df['timestamp'].min(), "%Y%m%d%H%M%S").strftime("%m/%d/%Y")
            latest_crawl = datetime.strptime(domain_df['timestamp'].max(), "%Y%m%d%H%M%S").strftime("%m/%d/%Y")
        else:
            earliest_crawl = latest_crawl = 'N/A'
        summary_data.append({
            'Domain': domain,
            'Pages Crawled': page_count,
            'Estimated Tokens': tokens,
            'Token Percentage': f"{percentage:.2f}%",
            'Earliest Crawl': earliest_crawl,
            'Latest Crawl': latest_crawl
        })

    summary_df = pd.DataFrame(summary_data)
    summary_csv = f"common_crawl_summary_{run_timestamp}.csv"
    summary_df.to_csv(summary_csv, index=False)
    print(f"Summary results saved to {summary_csv}")

    # Display summary
    print("\nSummary:")
    print(summary_df.to_string(index=False))


if __name__ == "__main__":
    main()
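
# Sketch of an example session, assuming the script is saved as
# common_crawl_checker.py (hypothetical filename) and index.commoncrawl.org
# is reachable. Domains, counts, and token figures below are illustrative only:
#
#   $ python common_crawl_checker.py
#   Welcome to the Common Crawl Domain Checker with Token Estimation!
#   Please enter a list of domains, separated by commas: example.com, example.org
#   Fetching data for example.com...
#   Found 12 pages and estimated 45000 tokens for example.com
#   Fetching data for example.org...
#   Found 8 pages and estimated 30000 tokens for example.org
#
#   Detailed results saved to common_crawl_results_<run_timestamp>.csv
#   Summary results saved to common_crawl_summary_<run_timestamp>.csv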