### Import Libraries

In [None]:
! pip install python-dotenv pandas

In [None]:
! pip install openpyxl

In [1]:
import json
import os
import time
import re
from typing import Dict, List
from urllib.parse import urlsplit

from dotenv import load_dotenv
from openai import OpenAI
import numpy as np
import pandas as pd

In [2]:
load_dotenv()

True

In [3]:
pd.set_option('display.max_rows', 200)

In [4]:
OPENAI_API_KEY      = os.getenv("OPENAI_API_KEY")
OPENAI_LLM_MODEL    = os.getenv("OPENAI_LLM_MODEL")

PERPLEXITY_API_KEY  = os.getenv("PERPLEXITY_API_KEY")
NOVITA_AI_API_KEY   = os.getenv("NOVITA_AI_API_KEY")
NOVITA_AI_LLM_MODEL = os.getenv("NOVITA_AI_LLM_MODEL")

In [5]:
import os
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = "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"
os.environ['LANGCHAIN_PROJECT'] = "emails"
from langsmith import utils
utils.tracing_is_enabled()

True

In [7]:
client = OpenAI(api_key=OPENAI_API_KEY)


def openai_llm_call(prompt: str,
                    openai_model_name: str = OPENAI_LLM_MODEL) -> str:
    chat_completion = client.chat.completions.create(
                                                    messages=[
                                                        {
                                                            "role": "user",
                                                            "content": prompt,
                                                        }
                                                    ],
                                                    model=openai_model_name,
                                                    temperature=0.01
                                                )
    return chat_completion.choices[0].message.content.strip(), chat_completion.usage.to_dict()

# Cost of Sonar Reasoning Pro call

In [6]:
# sonar_reasoning_pro_cost.py
PRICING = {
    "low":   {"input_per_million": 2.0, "output_per_million": 8.0, "per_request": 6 / 1000},
    "medium": {"input_per_million": 2.0, "output_per_million": 8.0, "per_request": 10 / 1000},
    "high":  {"input_per_million": 2.0, "output_per_million": 8.0, "per_request": 14 / 1000},
}

def call_sonar_reasoning_pro_cost(meta: dict, model: str = "sonar-reasoning-pro") -> float:
    """
    Return the USD cost for a single API call, given the usage-metadata dictionary.
    """
    ctx = meta.get("search_context_size", "low")
    p = PRICING[ctx]                     # pick the right row

    # per-token prices → per-token multipliers
    in_price  = p["input_per_million"]  / 1_000_000
    out_price = p["output_per_million"] / 1_000_000

    prompt  = meta.get("prompt_tokens", 0)
    complet = meta.get("completion_tokens", 0)

    token_cost = prompt * in_price + complet * out_price
    request_cost = p["per_request"]

    return round(token_cost + request_cost, 6)   # dollars

if __name__ == "__main__":
    usage = {'completion_tokens': 694,
             'prompt_tokens': 37,
             'total_tokens': 731,
             'search_context_size': 'low'}

    print(f"Call price: ${call_sonar_reasoning_pro_cost(usage):.4f}")



Call price: $0.0116


# cost for sonar pro model

In [6]:
# sonar_pro_cost.py


def calc_sonar_pro_cost(meta: dict) -> float:
    """
    Return the USD cost for one Sonar Pro API call.

    Parameters
    ----------
    meta : dict
        A usage-metadata dictionary like the one Perplexity returns, e.g.
        {
            'completion_tokens': 694,
            'prompt_tokens': 37,
            'total_tokens': 731,
            'search_context_size': 'low'
        }

    Returns
    -------
    float
        Cost in US dollars (rounded to 6 decimals).
    """
    
    PRICING = {
    "low":    {"input_per_million": 3.0, "output_per_million": 15.0,
               "per_request": 6 / 1000},
    "medium": {"input_per_million": 3.0, "output_per_million": 15.0,
               "per_request": 10 / 1000},
    "high":   {"input_per_million": 3.0, "output_per_million": 15.0,
               "per_request": 14 / 1000},
    }
    
    
    ctx = meta.get("search_context_size", "low")
    p = PRICING[ctx]

    # convert per-million rates to per-token
    input_rate  = p["input_per_million"]  / 1_000_000
    output_rate = p["output_per_million"] / 1_000_000

    prompt_tokens     = meta.get("prompt_tokens", 0)
    completion_tokens = meta.get("completion_tokens", 0)

    token_cost   = prompt_tokens * input_rate + completion_tokens * output_rate
    request_cost = p["per_request"]

    return round(token_cost + request_cost, 6)

if __name__ == "__main__":
    usage = {
        'completion_tokens': 694,
        'prompt_tokens': 37,
        'total_tokens': 731,
        'search_context_size': 'low'
    }
    print(f"Call price: ${calc_sonar_pro_cost(usage):.4f}")


Call price: $0.0165


# Perplexity Logic Alone 

#### Perpleixty protocol instead of openai

In [8]:
import requests  # Add this import

def perplexity_call(perplexity_query: str,
                    perplexity_model: str = "sonar-reasoning-pro", #sonar-pro , #sonar-reasoning-pro
                    temperature: float = 0.01) -> tuple:
    
    url = "https://api.perplexity.ai/chat/completions"
    headers = {
        "Authorization": f"Bearer {PERPLEXITY_API_KEY}",
        "Content-Type": "application/json"
    }
    
    payload = {
        "model": perplexity_model,
        "messages": [{
            "role": "system",
            "content": "You are a specialized web crawler focused on university domains"
        }, {
            "role": "user",
            "content": perplexity_query
        }],
        "temperature": temperature,
        "search_domain_filter": ["medipol.edu.tr"],  # Use full domain
        "return_citations": True
    }

    response = requests.post(url, headers=headers, json=payload)
    
    # Add error handling
    if response.status_code != 200:
        print(f"API Error: {response.status_code}")
        print(response.text)
        return "", {}, []
    
    response_json = response.json()
    
    
    # Extract required information from response
    content = response_json['choices'][0]['message']['content']
    usage = response_json['usage']
    citations = response_json.get('citations', [])
    
    return content, usage, citations

# perplexity_response, perplexity_tokens_usage, perplexity_citations  = perplexity_call(
#     "List all Study Languages or if there combined languages, at Istanbul Medipol University"
# )
# print("===== PERPLEXITY RESPONSE =====")
# print(perplexity_response)
# print("===========================")

# print("===== TOKEN USAGE =====")
# print(perplexity_tokens_usage)

# print(f"Call price: ${calc_sonar_pro_cost(usage):.4f}")
# print("===========================")

# print("===== CITATIONS =====")
# for citation in perplexity_citations:
#     print(citation)
# print("===========================")

### the query for single call for test











In [None]:
perplexity_query = f""" get me all information about (all location , return all loction titles with its address)  for "medipol.edu.tr" university
only search in this domain
"""

perplexity_query = f""" get me all information about (phone numbers)  for "medipol.edu.tr" university
only search in this domain
"""
perplexity_query = f""" get me the information about (About Us information)  for "medipol.edu.tr" university
only search in this domain
"""

perplexity_query = f""" for "medipol.edu.tr" university
search only in domain ,summary the basic vision of the university in general. not for specific departments but for university in general.
"""

perplexity_query = f""" for "medipol.edu.tr" university
search only in domain ,summary the basic Mission of the university in general. not for specific departments but for university in general.
"""


perplexity_query = f""" get me all information about Partnerships for "medipol.edu.tr" university
only search in this domain
"""

perplexity_query = f"""what is the Awards and Honors for "medipol.edu.tr" university
only search in this domain
"""

perplexity_query = f""" for "isikun.edu.tr" university
search only in domain ,copy for me the all google maps URLS for the university.if not found return "no information found"
"""

perplexity_query = f"""List all faculties and departments at Istanbul Medipol University with their official pages, search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""List all Study Languages or if there combined languages, at Istanbul Medipol University, search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""get me numbers of (Total Students , Undergraduate Students , Graduate Students , Local Students , International Students , Total Alumni),
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""what is the Accreditation for "medipol.edu.tr" university, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""what is the Blocked Nationalities for "medipol.edu.tr" university, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""what is the University Motto for "medipol.edu.tr" university, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""what is the Times Higher Education Ranking   for "medipol.edu.tr" university, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""what is the Year Established for "medipol.edu.tr" university, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""what is the Admission Requirements for "medipol.edu.tr" university,
search only in domain "medipol.edu.tr"
"""

perplexity_query = f""" what is the Scholarships and Financial Aids , list me all scholarships and financial aids the universoty provides for medipol university, 
search only in domain "medipol.edu.tr"
"""

perplexity_query = f""" what is the Discounts provides by medipol university for the students, list me it all 
search only in domain "medipol.edu.tr"
"""

perplexity_query = f""" Who are the Notable Alumni of medipol university, list me the all of them , If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f""" Who are the Partner Institutions of medipol university, list me the all of them , If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""list me all information about Industry Collaborations of medipol university, list me the all of them , If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f""" Who are the Partner Institutions of medipol university, list me the all of them , If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f""" what is the Student Satisfaction Ratings and Graduate Employment Rates  for medipol university, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f""" what is the Early Admission Policy for medipol university, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f""" what is the Deferred Admission Policy for medipol university, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f""" what is the Transfer Student Policy for medipol university, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f""" what is the Application Process Steps for medipol university, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

# perplexity_query = f""" what is the Tuition Currency and Official Tuition and Discounted Tuition for medipol university,  in general for all programs only for local students
# search only in domain "medipol.edu.tr"
# """

# perplexity_query = f""" what is the Tuition Currency and Official Tuition and Discounted Tuition for medipol university,  in general for all programs only for international students
# search only in domain "medipol.edu.tr"
# """

# perplexity_query = f""" what is the deposit required for medipol university if there specific deposit for each degree and with what amount and with what currency,  in general for all programs 
# search only in domain "medipol.edu.tr"
# """

# perplexity_query = f""" what is the application fee required for medipol university if there specific application fee for each degree and with what amount and with what currency,  in general for all programs 
# search only in domain "medipol.edu.tr"
# """

perplexity_query = f""" what is the contact information retreive ( contacts names , Emails,  organization phone numbers) for medipol university, 
search only in domain "medipol.edu.tr"
"""

perplexity_query = f""" what is the social media accounts with its links for medipol university, for (Facebook, X , Youtube , Insagram , Linkdin , Tiktok , Vk)
"""

perplexity_query = f"""List all information about (Library Information), at Istanbul Medipol University, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""List all information about (Research Facilities), at Istanbul Medipol University, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""List all information about (Housing Options), at Istanbul Medipol University, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""List all information about (Dining Services), at Istanbul Medipol University, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""List all information about (Sports Facilities), at Istanbul Medipol University, If not found, return "no information found
search only in domain "medipol.edu.tr"
""" 

perplexity_query = f"""List all information about (Student Organizations), at Istanbul Medipol University, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""             

perplexity_query = f"""List all information about (Health Services), at Istanbul Medipol University, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""List all information about (Career Counseling Services), at Istanbul Medipol University, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""List all information about (Disability Services), at Istanbul Medipol University, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""         

perplexity_query = f"""List all information about (Study Abroad and Exchange Students Details), at Istanbul Medipol University, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""List all information about (Campus Security Services), at Istanbul Medipol University, If not found, return "no information found
search only in domain "medipol.edu.tr"
""" 

perplexity_query = f"""List all information about (Technology Resources), at Istanbul Medipol University, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""     

perplexity_query = f"""List all information about (Transportation Services), at Istanbul Medipol University, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""List all information about (Cultural Centers), at Istanbul Medipol University, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""List all information about (Recreation Facilities), at Istanbul Medipol University, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""     

perplexity_query = f"""List all information about (Childcare Services), at Istanbul Medipol University, Childcare Services mean seerivces for children of students not programs, if not fount return  " no information found"
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""List all information about (Financial Services), at Istanbul Medipol University, Financial Services mean services for students to manage their finances,not programs, if not fount return  " no information found"
search only in domain "medipol.edu.tr"
"""

perplexity_query = f"""List all information about (Religious/Spiritual Services), at Istanbul Medipol University, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""         

perplexity_query = f"""List all information about (Student Support Services), at Istanbul Medipol University, If not found, return "no information found
search only in domain "medipol.edu.tr"
"""



Library Information
Research Facilities
Housing Options
Dining Services
Sports Facilities
Student Organizations
Health Services
Career Counseling Services
Disability Services
Study Abroad and Exchange Students Details
Campus Security Services
Technology Resources
Transportation Services
Cultural Centers
Recreation Facilities
Childcare Services
Financial Services
Religious/Spiritual Services
Student Support Services



## Run single query

In [23]:

# def perplexity_call(perplexity_query: str,perplexity_model: str = "sonar-reasoning-pro", temperature: float = 0.01) -> tuple:
#     """Calls the Perplexity API with a specific query and domain filter."""
#     if not PERPLEXITY_API_KEY:
#         print("API Error: PERPLEXITY_API_KEY environment variable not set.")
#         return "", {}, []
    
#     url = "https://api.perplexity.ai/chat/completions"
#     headers = {"Authorization": f"Bearer {PERPLEXITY_API_KEY}", "Content-Type": "application/json"}
#     payload = {
#         "model": perplexity_model,
#         "messages": [
#             {"role": "system", "content": "You are a specialized web crawler and information extractor. Focus exclusively on the provided domain. Answer the user's query concisely and accurately based on the website's content. If information is not found, clearly state that."},
#             {"role": "user", "content": perplexity_query}
#         ],
#         "temperature": temperature,
#         "search_domain_filter": ["itea.edu"],
#         "return_citations": True
#     }
    
#     try:
#         response = requests.post(url, headers=headers, json=payload, timeout=120)
#         response.raise_for_status()
#     except requests.exceptions.RequestException as e:
#         print(f"API Request Error: {e}")
#         return "", {}, []
    
#     response_json = response.json()
#     content = response_json['choices'][0]['message']['content']
#     usage = response_json.get('usage', {})
#     citations = response_json.get('citations', [])
    
#     return content, usage, citations


perplexity_query = f""" 
get me all information about (phone numbers) for "medipol.edu.tr " university, only search in this domain"
"""


print("===== PERPLEXITY QUERY =====")
print(perplexity_query)
print("===========================")
# Call with emails_only=True to ensure we only get emails
perplexity_response, perplexity_tokens_usage, perplexity_citations = perplexity_call(
    perplexity_query=perplexity_query
)

print("===== PERPLEXITY RESPONSE =====")
print(perplexity_response)
print("===========================")

print("===== TOKEN USAGE =====")
print(perplexity_tokens_usage)

print(f"Call price: ${call_sonar_reasoning_pro_cost(perplexity_tokens_usage):.4f}")
print("===========================")

print("===== CITATIONS =====")
for citation in perplexity_citations:
    print(citation)
print("===========================")


===== PERPLEXITY QUERY =====
 
get me all information about (phone numbers) for "medipol.edu.tr " university, only search in this domain"

===== PERPLEXITY RESPONSE =====
<think>
We are given a query: "get me all information about (phone numbers) for 'medipol.edu.tr' university, only search in this domain"

We have several search results from the domain medipol.edu.tr. We need to extract all phone numbers mentioned in these search results.

Let's go through each search result:

[1] https://www.medipol.edu.tr/en/about-medipol/contact-us
- Lists:
  - akademik@medipol.edu.tr · 444 85 44 - 4800
  - Administrative Units, idari@medipol.edu.tr · 444 85 44 - 4800
  - Students Affairs Office, ogrenciisleri@medipol.edu.tr ... (no phone number in the snippet)

[2] https://mio.medipol.edu.tr/contact-units-advisors-at-campus
- Telephone: 444 85 44 -Extension number :5360 / 1853
- Also lists:
  - Telephone: 444 85 44 -Extension number :5360 / 1853
  - WhatsApp: +90 531 882 1331

[3] https://mio.medi

: 

## Main Function - save txt file

In [74]:
import requests
import os
import re
import time

# --- Configuration ---
# IMPORTANT: Set your API key as an environment variable for security.
# In your terminal (before starting Jupyter): export PERPLEXITY_API_KEY='your_api_key_here'
# Or, for testing in a notebook (less secure), you can uncomment the line below:
# os.environ['PERPLEXITY_API_KEY'] = 'pplx-...' 

PERPLEXITY_API_KEY = os.getenv("PERPLEXITY_API_KEY")

# Placeholder for the university domain
TARGET_DOMAINS = [ "akdeniz.edu.tr" , "khas.edu.tr" , "biruni.edu.tr", "beykent.edu.tr" ,"lokmanhekim.edu.tr" , "okan.edu.tr" ,"mendelu.cz", "isikun.edu.tr"] #"medipol.edu.tr" ,

# Output file name
OUTPUT_FILENAME = f"{TARGET_DOMAINS}_data.txt"

def generate_queries_for_domain(domain: str) -> list:
    """Generates a list of query dictionaries for the given domain."""
    # This function now holds the query templates.
    query_templates = [
        {'title': 'About Us', 'query': f'get me the information about (About Us information) for {domain} university, only search in this domain'},
        {'title': 'Vision', 'query': f'for {domain} university, search only in domain, summary the basic vision of the university in general. not for specific departments but for university in general'},
        {'title': 'Mission', 'query': f'for {domain} university, search only in domain, summary the basic Mission of the university in general. not for specific departments but for university in general'},
        {'title': 'Locations and Addresses', 'query': f'get me all information about (all location, return all location titles with its address) for {domain} university, only search in this domain'},
        {'title': 'Google Maps URLS', 'query': f'get me all information about (google maps URLS) for {domain} university, only search in this domain, If not found return "no information found"'},
        {'title': 'Phone Numbers', 'query': f'get me all information about (phone numbers) for {domain} university, only search in this domain'},
        {'title': 'Partnerships', 'query': f'get me all information about Partnerships for {domain} university, only search in this domain'},
        {'title': 'Awards and Honors', 'query': f'what is the Awards and Honors for {domain} university, only search in this domain'},
        {'title': 'Faculties and Departments', 'query': f'List all faculties and departments at {domain} with their official pages, search only in domain'},
        {'title': 'Study Languages', 'query': f'List all Study Languages or if there combined languages at {domain}, search only in domain'},
        {'title': 'Student Statistics', 'query': f'get me numbers of (Total Students, Undergraduate Students, Graduate Students, Local Students, International Students, Total Alumni) for {domain}, search only in domain'},
        {'title': 'Accreditation', 'query': f'what is the Accreditation for {domain} university, If not found return "no information found", search only in domain'},
        {'title': 'Blocked Nationalities', 'query': f'what is the Blocked Nationalities for {domain} university, If not found return "no information found", search only in domain'},
        {'title': 'University Motto', 'query': f'what is the University Motto for {domain} university, If not found return "no information found", search only in domain'},
        {'title': 'THE Ranking', 'query': f'what is the Times Higher Education Ranking for {domain} university, If not found return "no information found", search only in domain'},
        {'title': 'Year Established', 'query': f'what is the Year Established for {domain} university, If not found return "no information found", search only in domain'},
        {'title': 'Admission Requirements', 'query': f'what is the Admission Requirements for {domain} university, search only in domain'},
        {'title': 'Scholarships', 'query': f'what is the Scholarships and Financial Aids, list all scholarships and financial aids provided by {domain}, search only in domain'},
        {'title': 'Student Discounts', 'query': f'what are the Discounts provided by {domain} for students, list all, search only in domain'},
        {'title': 'Notable Alumni', 'query': f'Who are the Notable Alumni of {domain}, list all, If not found return "no information found", search only in domain'},
        {'title': 'Partner Institutions', 'query': f'Who are the Partner Institutions of {domain}, list all, If not found return "no information found", search only in domain'},
        {'title': 'Industry Collaborations', 'query': f'list all information about Industry Collaborations of {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Student Satisfaction', 'query': f'what is the Student Satisfaction Ratings and Graduate Employment Rates for {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Early Admission Policy', 'query': f'what is the Early Admission Policy for {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Deferred Admission Policy', 'query': f'what is the Deferred Admission Policy for {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Transfer Student Policy', 'query': f'what is the Transfer Student Policy for {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Application Process', 'query': f'what is the Application Process Steps for {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Contact Information', 'query': f'what is the contact information (contacts names, Emails, organization phone numbers) for {domain}, search only in domain'},
        {'title': 'Social Media', 'query': f'what are the social media accounts with links for {domain} (Facebook, X, Youtube, Instagram, Linkedin, Tiktok, Vk), search only in domain'},
        {'title': 'Library Information', 'query': f'List all information about Library Information at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Research Facilities', 'query': f'List all information about Research Facilities at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Housing Options', 'query': f'List all information about Housing Options at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Dining Services', 'query': f'List all information about Dining Services at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Sports Facilities', 'query': f'List all information about Sports Facilities at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Student Organizations', 'query': f'List all information about Student Organizations at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Health Services', 'query': f'List all information about Health Services at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Career Counseling', 'query': f'List all information about Career Counseling Services at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Disability Services', 'query': f'List all information about Disability Services at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Study Abroad Programs', 'query': f'List all information about Study Abroad and Exchange Students Details at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Campus Security', 'query': f'List all information about Campus Security Services at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Technology Resources', 'query': f'List all information about Technology Resources at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Transportation Services', 'query': f'List all information about Transportation Services at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Cultural Centers', 'query': f'List all information about Cultural Centers at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Recreation Facilities', 'query': f'List all information about Recreation Facilities at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Childcare Services', 'query': f'List all information about Childcare Services (services for children of students) at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Financial Services', 'query': f'List all information about Financial Services (services for students to manage finances) at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Religious Services', 'query': f'List all information about Religious/Spiritual Services at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Student Support Services', 'query': f'List all information about Student Support Services at {domain}, If not found return "no information found", search only in domain'}
    ]
    
    return [{'title': qt['title'], 'query': qt['query'].format(domain=domain)} for qt in query_templates]


def clean_citation_references(text: str) -> str:
    """
    Remove citation references like [1], [2], [3], [^1^], [^ranking^], etc. from the text.
    Remove <think> block .
    """
    # Pattern to match citation references: [number] or [number][number] etc.
    cleaned_text = re.sub(r'\[\d+\](?:\[\d+\])*', '', text)
    # Pattern to match citation references: [^number^] or [^text^]
    cleaned_text = re.sub(r'\[\^[^\]]+\^\]', '', cleaned_text)
    
    cleaned_text = re.sub(r'<think>.*?</think>', '', cleaned_text, flags=re.DOTALL).strip()
    
    return cleaned_text


def perplexity_call(perplexity_query: str, domain: str, perplexity_model: str = "sonar-reasoning-pro", temperature: float = 0.01) -> tuple:
    """Calls the Perplexity API with a specific query and domain filter."""
    if not PERPLEXITY_API_KEY:
        print("API Error: PERPLEXITY_API_KEY environment variable not set.")
        return "", {}, []
    url = "https://api.perplexity.ai/chat/completions"
    headers = {"Authorization": f"Bearer {PERPLEXITY_API_KEY}", "Content-Type": "application/json"}
    payload = {
        "model": perplexity_model,
        "messages": [
            {"role": "system", "content": "You are a specialized web crawler and information extractor. Focus exclusively on the provided domain. Answer the user's query concisely and accurately based on the website's content. If information is not found, clearly state that."},
            {"role": "user", "content": perplexity_query}
        ],
        "temperature": temperature,
        "search_domain_filter": [domain],
        "return_citations": True
    }
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=120)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"API Request Error: {e}")
        return "", {}, []
    response_json = response.json()
    content = response_json['choices'][0]['message']['content']
    usage = response_json.get('usage', {})
    citations = response_json.get('citations', [])
    return content, usage, citations

def main():
    """Main function to orchestrate the processing of all domains."""
    if not PERPLEXITY_API_KEY:
        print("FATAL: PERPLEXITY_API_KEY is not set. Exiting.")
        return

    # Loop through each domain in the list
    for domain in TARGET_DOMAINS:
        print("\n" + "="*80)
        print(f"STARTING PROCESSING FOR DOMAIN: {domain}")
        print("="*80)

        # 1. Generate dynamic filename and queries for the current domain
        output_filename = f"{domain}_data.txt"
        queries = generate_queries_for_domain(domain)
        
        # 2. Initialize cost tracker for the current domain
        domain_total_cost = 0.0
        
        with open(output_filename, 'w', encoding='utf-8') as f:
            total_queries = len(queries)
            for i, item in enumerate(queries):
                title = item['title']
                query = item['query']
                
                print(f"\n[{i+1}/{total_queries}] Processing '{title}' for {domain}...")
                
                response_content, usage, citations = perplexity_call(
                    perplexity_query=query,
                    domain=domain
                )
                
                if usage:
                    domain_total_cost += calc_sonar_pro_cost(usage)
                
                if response_content:
                    response_content = clean_citation_references(response_content)

                f.write(f"## {title}\n\n")
                if response_content:
                    f.write(f"{response_content}\n\n")
                    if citations:
                        f.write("Citations:\n")
                        for cit in citations:
                            f.write(f"- {cit}\n")
                    else:
                        f.write("No citations provided.\n")
                else:
                    f.write("Failed to retrieve information for this query.\n")
                
                f.write("\n" + "-"*80 + "\n\n")
                print(f"Finished: '{title}'. Data saved.")
                time.sleep(2) # Be polite to the API

        # 3. Print summary for the completed domain
        print("\n" + "-"*80)
        print(f"COMPLETED PROCESSING FOR DOMAIN: {domain}")
        print(f"Report saved to: {output_filename}")
        print(f"Total Estimated Cost for this domain: ${domain_total_cost:.4f}")
        print("-"*80)

    print("\n\nALL DOMAINS HAVE BEEN PROCESSED.")

# Run the main function
if __name__ == "__main__":
    main()


STARTING PROCESSING FOR DOMAIN: pacificcollege.edu

[1/48] Processing 'About Us' for pacificcollege.edu...
Finished: 'About Us'. Data saved.

[2/48] Processing 'Vision' for pacificcollege.edu...
Finished: 'Vision'. Data saved.

[3/48] Processing 'Mission' for pacificcollege.edu...
Finished: 'Mission'. Data saved.

[4/48] Processing 'Locations and Addresses' for pacificcollege.edu...
Finished: 'Locations and Addresses'. Data saved.

[5/48] Processing 'Google Maps URLS' for pacificcollege.edu...
Finished: 'Google Maps URLS'. Data saved.

[6/48] Processing 'Phone Numbers' for pacificcollege.edu...
Finished: 'Phone Numbers'. Data saved.

[7/48] Processing 'Partnerships' for pacificcollege.edu...
Finished: 'Partnerships'. Data saved.

[8/48] Processing 'Awards and Honors' for pacificcollege.edu...
Finished: 'Awards and Honors'. Data saved.

[9/48] Processing 'Faculties and Departments' for pacificcollege.edu...
Finished: 'Faculties and Departments'. Data saved.

[10/48] Processing 'Study L

## Main Function - save excel file

In [10]:
import requests
import os
import re
import time
import pandas as pd

# --- Configuration ---
# IMPORTANT: Set your API key as an environment variable for security.
# In your terminal (before starting Jupyter): export PERPLEXITY_API_KEY='your_api_key_here'
# Or, for testing in a notebook (less secure), you can uncomment the line below:
# os.environ['PERPLEXITY_API_KEY'] = 'pplx-...' 

PERPLEXITY_API_KEY = os.getenv("PERPLEXITY_API_KEY")

# Placeholder for the university domain
TARGET_DOMAINS = ["bryantstratton.edu","pacificcollege.edu","akdeniz.edu.tr" , "khas.edu.tr" , "biruni.edu.tr", "beykent.edu.tr" ,"lokmanhekim.edu.tr" , "okan.edu.tr" ,"mendelu.cz", "isikun.edu.tr"] # "medipol.edu.tr","itea.edu",

def generate_queries_for_domain(domain: str) -> list:
    """Generates a list of query dictionaries for the given domain."""
    # This function now holds the query templates.
    query_templates = [
        {'title': 'About Us', 'query': f'get me the information about (About Us information) for {domain} university, only search in this domain'},
        {'title': 'Vision', 'query': f'for {domain} university, search only in domain, summary the basic vision of the university in general. not for specific departments but for university in general'},
        {'title': 'Mission', 'query': f'for {domain} university, search only in domain, summary the basic Mission of the university in general. not for specific departments but for university in general'},
        {'title': 'Locations and Addresses', 'query': f'get me all information about (all location, return all location titles with its address) for {domain} university, only search in this domain'},
        {'title': 'Google Maps URLS', 'query': f'get me all information about (google maps URLS) for {domain} university, only search in this domain, If not found return "no information found"'},
        {'title': 'Phone Numbers', 'query': f'get me all information about (phone numbers) for {domain} university, only search in this domain'},
        {'title': 'Partnerships', 'query': f'get me all information about Partnerships for {domain} university, only search in this domain'},
        {'title': 'Awards and Honors', 'query': f'what is the Awards and Honors for {domain} university, only search in this domain'},
        {'title': 'Faculties and Departments', 'query': f'List all faculties and departments at {domain} with their official pages, search only in domain'},
        {'title': 'Study Languages', 'query': f'List all Study Languages or if there combined languages at {domain}, search only in domain'},
        {'title': 'Student Statistics', 'query': f'get me numbers of (Total Students, Undergraduate Students, Graduate Students, Local Students, International Students, Total Alumni) for {domain}, search only in domain'},
        {'title': 'Accreditation', 'query': f'what is the Accreditation for {domain} university, If not found return "no information found", search only in domain'},
        {'title': 'Blocked Nationalities', 'query': f'what is the Blocked Nationalities for {domain} university, If not found return "no information found", search only in domain'},
        {'title': 'University Motto', 'query': f'what is the University Motto for {domain} university, If not found return "no information found", search only in domain'},
        {'title': 'THE Ranking', 'query': f'what is the Times Higher Education Ranking for {domain} university, If not found return "no information found", search only in domain'},
        {'title': 'Year Established', 'query': f'what is the Year Established for {domain} university, If not found return "no information found", search only in domain'},
        {'title': 'Admission Requirements', 'query': f'what is the Admission Requirements for {domain} university, search only in domain'},
        {'title': 'Scholarships', 'query': f'what is the Scholarships and Financial Aids, list all scholarships and financial aids provided by {domain}, search only in domain'},
        {'title': 'Student Discounts', 'query': f'what are the Discounts provided by {domain} for students, list all, search only in domain'},
        {'title': 'Notable Alumni', 'query': f'Who are the Notable Alumni of {domain}, list all, If not found return "no information found", search only in domain'},
        {'title': 'Partner Institutions', 'query': f'Who are the Partner Institutions of {domain}, list all, If not found return "no information found", search only in domain'},
        {'title': 'Industry Collaborations', 'query': f'list all information about Industry Collaborations of {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Student Satisfaction', 'query': f'what is the Student Satisfaction Ratings and Graduate Employment Rates for {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Early Admission Policy', 'query': f'what is the Early Admission Policy for {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Deferred Admission Policy', 'query': f'what is the Deferred Admission Policy for {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Transfer Student Policy', 'query': f'what is the Transfer Student Policy for {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Application Process', 'query': f'what is the Application Process Steps for {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Contact Information', 'query': f'what is the contact information (contacts names, Emails, organization phone numbers) for {domain}, search only in domain'},
        {'title': 'Social Media', 'query': f'what are the social media accounts with links for {domain} (Facebook, X, Youtube, Instagram, Linkedin, Tiktok, Vk), search only in domain'},
        {'title': 'Library Information', 'query': f'List all information about Library Information at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Research Facilities', 'query': f'List all information about Research Facilities at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Housing Options', 'query': f'List all information about Housing Options at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Dining Services', 'query': f'List all information about Dining Services at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Sports Facilities', 'query': f'List all information about Sports Facilities at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Student Organizations', 'query': f'List all information about Student Organizations at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Health Services', 'query': f'List all information about Health Services at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Career Counseling', 'query': f'List all information about Career Counseling Services at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Disability Services', 'query': f'List all information about Disability Services at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Study Abroad Programs', 'query': f'List all information about Study Abroad and Exchange Students Details at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Campus Security', 'query': f'List all information about Campus Security Services at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Technology Resources', 'query': f'List all information about Technology Resources at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Transportation Services', 'query': f'List all information about Transportation Services at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Cultural Centers', 'query': f'List all information about Cultural Centers at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Recreation Facilities', 'query': f'List all information about Recreation Facilities at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Childcare Services', 'query': f'List all information about Childcare Services (services for children of students) at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Financial Services', 'query': f'List all information about Financial Services (services for students to manage finances) at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Religious Services', 'query': f'List all information about Religious/Spiritual Services at {domain}, If not found return "no information found", search only in domain'},
        {'title': 'Student Support Services', 'query': f'List all information about Student Support Services at {domain}, If not found return "no information found", search only in domain'}
    ]
    
    return [{'title': qt['title'], 'query': qt['query'].format(domain=domain)} for qt in query_templates]


def clean_citation_references(text: str) -> str:
    """
    Remove citation references like [1], [2], [3], [^1^], [^ranking^], etc. from the text.
    Remove <think> block .
    """
    # Pattern to match citation references: [number] or [number][number] etc.
    cleaned_text = re.sub(r'\[\d+\](?:\[\d+\])*', '', text)
    # Pattern to match citation references: [^number^] or [^text^]
    cleaned_text = re.sub(r'\[\^[^\]]+\^\]', '', cleaned_text)
    
    cleaned_text = re.sub(r'<think>.*?</think>', '', cleaned_text, flags=re.DOTALL).strip()
    
    return cleaned_text


def format_citations(citations: list) -> str:
    """Format citations list into a single string."""
    if not citations:
        return "No citations provided"
    
    formatted_citations = []
    for i, citation in enumerate(citations, 1):
        formatted_citations.append(f"{i}. {citation}")
    
    return "\n".join(formatted_citations)


def calc_sonar_pro_cost(meta: dict) -> float:
    """
    Return the USD cost for one Sonar Pro API call.

    Parameters
    ----------
    meta : dict
        A usage-metadata dictionary like the one Perplexity returns, e.g.
        {
            'completion_tokens': 694,
            'prompt_tokens': 37,
            'total_tokens': 731,
            'search_context_size': 'low'
        }

    Returns
    -------
    float
        Cost in US dollars (rounded to 6 decimals).
    """
    
    PRICING = {
    "low":    {"input_per_million": 3.0, "output_per_million": 15.0,
               "per_request": 6 / 1000},
    "medium": {"input_per_million": 3.0, "output_per_million": 15.0,
               "per_request": 10 / 1000},
    "high":   {"input_per_million": 3.0, "output_per_million": 15.0,
               "per_request": 14 / 1000},
    }
    
    
    ctx = meta.get("search_context_size", "low")
    p = PRICING[ctx]

    # convert per-million rates to per-token
    input_rate  = p["input_per_million"]  / 1_000_000
    output_rate = p["output_per_million"] / 1_000_000

    prompt_tokens     = meta.get("prompt_tokens", 0)
    completion_tokens = meta.get("completion_tokens", 0)

    token_cost   = prompt_tokens * input_rate + completion_tokens * output_rate
    request_cost = p["per_request"]

    return round(token_cost + request_cost, 6)


def perplexity_call(perplexity_query: str, domain: str, perplexity_model: str = "sonar-reasoning-pro", temperature: float = 0.01) -> tuple:
    """Calls the Perplexity API with a specific query and domain filter."""
    if not PERPLEXITY_API_KEY:
        print("API Error: PERPLEXITY_API_KEY environment variable not set.")
        return "", {}, []
    
    url = "https://api.perplexity.ai/chat/completions"
    headers = {"Authorization": f"Bearer {PERPLEXITY_API_KEY}", "Content-Type": "application/json"}
    payload = {
        "model": perplexity_model,
        "messages": [
            {"role": "system", "content": "You are a specialized web crawler and information extractor. Focus exclusively on the provided domain. Answer the user's query concisely and accurately based on the website's content. If information is not found, clearly state that."},
            {"role": "user", "content": perplexity_query}
        ],
        "temperature": temperature,
        "search_domain_filter": [domain],
        "return_citations": True
    }
    
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=120)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"API Request Error: {e}")
        return "", {}, []
    
    response_json = response.json()
    content = response_json['choices'][0]['message']['content']
    usage = response_json.get('usage', {})
    citations = response_json.get('citations', [])
    
    return content, usage, citations


def main():
    """Main function to orchestrate the processing of all domains."""
    if not PERPLEXITY_API_KEY:
        print("FATAL: PERPLEXITY_API_KEY is not set. Exiting.")
        return

    # Loop through each domain in the list
    for domain in TARGET_DOMAINS:
        print("\n" + "="*80)
        print(f"STARTING PROCESSING FOR DOMAIN: {domain}")
        print("="*80)

        # 1. Generate dynamic filename and queries for the current domain
        output_filename = f"{domain}_data.xlsx"
        queries = generate_queries_for_domain(domain)
        
        # 2. Initialize cost tracker and data storage for the current domain
        domain_total_cost = 0.0
        data_rows = []
        
        total_queries = len(queries)
        for i, item in enumerate(queries):
            title = item['title']
            query = item['query']
            
            print(f"\n[{i+1}/{total_queries}] Processing '{title}' for {domain}...")
            
            response_content, usage, citations = perplexity_call(
                perplexity_query=query,
                domain=domain
            )
            
            if usage:
                domain_total_cost += calc_sonar_pro_cost(usage)
            
            if response_content:
                response_content = clean_citation_references(response_content)
            else:
                response_content = "Failed to retrieve information for this query."
            
            # Format citations for Excel
            formatted_citations = format_citations(citations)
            
            # Add row to data
            data_rows.append({
                'Title': title,
                'Response Content': response_content,
                'Citations': formatted_citations
            })
            
            print(f"Finished: '{title}'. Data collected.")
            time.sleep(2)  # Be polite to the API

        # 3. Create DataFrame and save to Excel
        df = pd.DataFrame(data_rows)
        
        # Save to Excel with formatting
        with pd.ExcelWriter(output_filename, engine='openpyxl') as writer:
            df.to_excel(writer, sheet_name=f'{domain}_data', index=False)
            
            # Get the workbook and worksheet for formatting
            workbook = writer.book
            worksheet = writer.sheets[f'{domain}_data']
            
            # Auto-adjust column widths
            for column in worksheet.columns:
                max_length = 0
                column_letter = column[0].column_letter
                
                for cell in column:
                    try:
                        if len(str(cell.value)) > max_length:
                            max_length = len(str(cell.value))
                    except:
                        pass
                
                # Set a reasonable maximum width
                adjusted_width = min(max_length + 2, 50)
                worksheet.column_dimensions[column_letter].width = adjusted_width
            
            # Enable text wrapping for all cells
            from openpyxl.styles import Alignment
            for row in worksheet.iter_rows():
                for cell in row:
                    cell.alignment = Alignment(wrap_text=True, vertical='top')

        # 4. Print summary for the completed domain
        print("\n" + "-"*80)
        print(f"COMPLETED PROCESSING FOR DOMAIN: {domain}")
        print(f"Excel report saved to: {output_filename}")
        print(f"Total rows: {len(data_rows)}")
        print(f"Total Estimated Cost for this domain: ${domain_total_cost:.4f}")
        print("-"*80)

    print("\n\nALL DOMAINS HAVE BEEN PROCESSED.")


# Run the main function
if __name__ == "__main__":
    main()


STARTING PROCESSING FOR DOMAIN: bryantstratton.edu

[1/48] Processing 'About Us' for bryantstratton.edu...
Finished: 'About Us'. Data collected.

[2/48] Processing 'Vision' for bryantstratton.edu...
Finished: 'Vision'. Data collected.

[3/48] Processing 'Mission' for bryantstratton.edu...
Finished: 'Mission'. Data collected.

[4/48] Processing 'Locations and Addresses' for bryantstratton.edu...
Finished: 'Locations and Addresses'. Data collected.

[5/48] Processing 'Google Maps URLS' for bryantstratton.edu...
Finished: 'Google Maps URLS'. Data collected.

[6/48] Processing 'Phone Numbers' for bryantstratton.edu...
Finished: 'Phone Numbers'. Data collected.

[7/48] Processing 'Partnerships' for bryantstratton.edu...
Finished: 'Partnerships'. Data collected.

[8/48] Processing 'Awards and Honors' for bryantstratton.edu...
Finished: 'Awards and Honors'. Data collected.

[9/48] Processing 'Faculties and Departments' for bryantstratton.edu...
Finished: 'Faculties and Departments'. Data col

# extract programs information from university website











## queries for programs information











## run single query

In [9]:
import requests

def perplexity_call(perplexity_query: str,
                    perplexity_model: str = "sonar-reasoning-pro",
                    temperature: float = 0.01) -> tuple:
    
    url = "https://api.perplexity.ai/chat/completions"
    headers = {
        "Authorization": f"Bearer {PERPLEXITY_API_KEY}",
        "Content-Type": "application/json"
    }
    
    payload = {
        "model": perplexity_model,
        "messages": [{
            "role": "system",
            "content": "You are a specialized web crawler focused on university domains"
        }, {
            "role": "user",
            "content": perplexity_query
        }],
        "temperature": temperature,
        "search_domain_filter": ["iona.edu"],
        "return_citations": True,
        "web_search_options": {
            "search_context_size": "high"  # This is the key addition
        }
    }
    
    response = requests.post(url, headers=headers, json=payload)
    
    if response.status_code != 200:
        print(f"API Error: {response.status_code}")
        print(response.text)
        return "", {}, []
    
    response_json = response.json()
    
    content = response_json['choices'][0]['message']['content']
    usage = response_json['usage']
    citations = response_json.get('citations', [])
    
    return content, usage, citations





perplexity_query = f""" for iona.edu website listing programs for all degrees , search in all resources for any information about programs avalibles, search only in domain
"""


print("===== PERPLEXITY QUERY =====")
print(perplexity_query)
print("===========================")
# Call with emails_only=True to ensure we only get emails
perplexity_response, perplexity_tokens_usage, perplexity_citations = perplexity_call(
    perplexity_query=perplexity_query
)

print("===== PERPLEXITY RESPONSE =====")
print(perplexity_response)
print("===========================")

print("===== TOKEN USAGE =====")
print(perplexity_tokens_usage)

print(f"Call price: ${call_sonar_reasoning_pro_cost(perplexity_tokens_usage):.4f}")
print("===========================")

print("===== CITATIONS =====")
for citation in perplexity_citations:
    print(citation)
print("===========================")

===== PERPLEXITY QUERY =====
 for iona.edu website listing programs for all degrees , search in all resources for any information about programs avalibles, search only in domain

===== PERPLEXITY RESPONSE =====
<think>
We are given a query: "for iona.edu website listing programs for all degrees , search in all resources for any information about programs avalibles, search only in domain"

The user is asking for a listing of all degree programs available at Iona University, specifically from the iona.edu domain.

We have multiple search results from the iona.edu domain that provide information about various programs. We need to compile a comprehensive list of all degree programs (both undergraduate and graduate) from these sources.

Let's analyze the search results:

[1] Lists graduate programs at Iona in:
- Allied Health Sciences and Nursing (with specific degrees: MA in Communication Sciences and Disorders, MS in Healthcare Leadership & Innovation, MS in Nursing Healthcare Organizatio

# Main function - with dublicated urls

In [97]:
import requests
import os
import re
import time

# --- Configuration ---
# IMPORTANT: Set your API key as an environment variable for security.
# In your terminal (before starting Jupyter): export PERPLEXITY_API_KEY='your_api_key_here'
# Or, for testing in a notebook (less secure), you can uncomment the line below:
# os.environ['PERPLEXITY_API_KEY'] = 'pplx-...' 

PERPLEXITY_API_KEY = os.getenv("PERPLEXITY_API_KEY")

# Placeholder for the university domain
TARGET_DOMAINS = [ "yeniyuzyil.edu.tr"] #"medipol.edu.tr" ,

# Output file name
# OUTPUT_FILENAME = f"{TARGET_DOMAINS}_data.txt"

def generate_queries_for_domain(domain: str) -> list:
    """Generates a list of query dictionaries for the given domain."""
    # This function now holds the query templates.
    query_templates = [
        {
            'title': 'Programs Names',
            'query': f'Return 10 resources from {TARGET_DOMAINS} website listing programs for all degrees (Bachelor, Master, PhD, etc.), search only in domain'
        },
        {
            'title': 'Local Students - Tuition Fees',
            'query': f'Return 10 resources from {TARGET_DOMAINS} website about tuition fees for local students across all programs, search only in domain'
        },
        {
            'title': 'Local Students - Application Fees and Deposit Fees',
            'query': f'What are the application fees and deposit fees for local students at {TARGET_DOMAINS} university, search only in domain'
        },
        {
            'title': 'International Students - Tuition Fees',
            'query': f'Return 10 resources from {TARGET_DOMAINS} website about tuition fees for international students across all programs, search only in domain'
        },
        # {
        #     'title': 'International Students - Application Fees and Deposit Fees',
        #     'query': f'What are the application fees and deposit fees for international students at {TARGET_DOMAINS} university, search only in domain'
        # },
        # {
        #     'title': 'Program Coordinators/Director Names, Phones and Emails',
        #     'query': f'Retrieve program coordinators/director names, phone numbers and emails at {TARGET_DOMAINS} university, search only in domain'
        # },
        # {
        #     'title': 'Program Requirements',
        #     'query': f'Are there any specific requirements for programs at {TARGET_DOMAINS} university? If not found return "no information found", search only in domain'
        # },
        # {
        #     'title': 'Program-specific Scholarships',
        #     'query': f'Are there scholarships/financial aids for specific programs at {TARGET_DOMAINS} university? If not found return "no information found", search only in domain'
        # },
        # {
        #     'title': 'Career Path and Courses',
        #     'query': f'Find career paths, core courses, and elective courses for every program at {TARGET_DOMAINS} university, search only in domain'
        # },
        # {
        #     'title': 'Program Discounts',
        #     'query': f'What discounts are provided for programs at {TARGET_DOMAINS} university, search only in domain'
        # },
        # {
        #     'title': 'Minimum GPA and Required Documents',
        #     'query': f'What are the minimum GPA requirements and required documents for specific programs at {TARGET_DOMAINS} university, search only in domain'
        # },
        # {
        #     'title': 'Interview Requirements and Application Process',
        #     'query': f'What are the interview requirements and application process steps for specific programs at {TARGET_DOMAINS} university, search only in domain'
        # },
        # {
        #     'title': 'Additional Exams and Minimum Scores',
        #     'query': f'Are additional exams required for specific programs at {TARGET_DOMAINS} university? If yes, list minimum scores per program, search only in domain'
        # }
    ]
    
    return [{'title': qt['title'], 'query': qt['query'].format(domain=domain)} for qt in query_templates]


def clean_citation_references(text: str) -> str:
    """
    Remove citation references like [1], [2], [3], [^1^], [^ranking^], etc. from the text.
    Remove <think> block .
    """
    # Pattern to match citation references: [number] or [number][number] etc.
    cleaned_text = re.sub(r'\[\d+\](?:\[\d+\])*', '', text)
    # Pattern to match citation references: [^number^] or [^text^]
    cleaned_text = re.sub(r'\[\^[^\]]+\^\]', '', cleaned_text)
    
    cleaned_text = re.sub(r'<think>.*?</think>', '', cleaned_text, flags=re.DOTALL).strip()
    
    return cleaned_text


def deduplicate_urls(citations: list) -> list:
    """
    Remove duplicate URLs from the citations list while preserving order.
    Also cleans up any malformed URLs.
    """
    seen_urls = set()
    unique_citations = []
    
    for citation in citations:
        # Clean up the URL - remove extra spaces and handle malformed entries
        clean_url = citation.strip()
        
        # Skip empty or very short entries
        if len(clean_url) < 10:
            continue
            
        # Ensure URL starts with http:// or https://
        if not clean_url.startswith(('http://', 'https://')):
            # Skip malformed URLs that don't start with proper protocol
            continue
        
        # Normalize URL by removing trailing slashes and converting to lowercase for comparison
        normalized_url = clean_url.rstrip('/').lower()
        
        # Only add if we haven't seen this URL before
        if normalized_url not in seen_urls:
            seen_urls.add(normalized_url)
            unique_citations.append(clean_url)  # Keep original case for output
    
    return unique_citations



def perplexity_call(perplexity_query: str, domain: str, perplexity_model: str = "sonar-reasoning-pro", temperature: float = 0.01) -> tuple:
    """Calls the Perplexity API with a specific query and domain filter."""
    if not PERPLEXITY_API_KEY:
        print("API Error: PERPLEXITY_API_KEY environment variable not set.")
        return "", {}, []
    url = "https://api.perplexity.ai/chat/completions"
    headers = {"Authorization": f"Bearer {PERPLEXITY_API_KEY}", "Content-Type": "application/json"}
    payload = {
        "model": perplexity_model,
        "messages": [
            {"role": "system", "content": "You are a specialized web crawler and information extractor. Focus exclusively on the provided domain. Answer the user's query concisely and accurately based on the website's content. If information is not found, clearly state that."},
            {"role": "user", "content": perplexity_query}
        ],
        "temperature": temperature,
        "search_domain_filter": [domain],
        "return_citations": True,
        "web_search_options": {
            "search_context_size": "high"  # This is the key addition
        }
    }
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=120)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"API Request Error: {e}")
        return "", {}, []
    response_json = response.json()
    content = response_json['choices'][0]['message']['content']
    usage = response_json.get('usage', {})
    citations = response_json.get('citations', [])
    return content, usage, citations




def main():
    """Main function to orchestrate the processing of all domains."""
    if not PERPLEXITY_API_KEY:
        print("FATAL: PERPLEXITY_API_KEY is not set. Exiting.")
        return

    # Loop through each domain in the list
    for domain in TARGET_DOMAINS:
        print("\n" + "="*80)
        print(f"STARTING PROCESSING FOR DOMAIN: {domain}")
        print("="*80)

        # 1. Generate dynamic filename and queries for the current domain
        output_filename = f"{domain}_resources.txt"
        queries = generate_queries_for_domain(domain)
        
        # 2. Initialize cost tracker for the current domain
        domain_total_cost = 0.0
        
        with open(output_filename, 'w', encoding='utf-8') as f:
            total_queries = len(queries)
            for i, item in enumerate(queries):
                title = item['title']
                query = item['query']
                
                print(f"\n[{i+1}/{total_queries}] Processing '{title}' for {domain}...")
                
                response_content, usage, citations = perplexity_call(
                    perplexity_query=query,
                    domain=domain
                )
                
                if usage:
                    domain_total_cost += calc_sonar_pro_cost(usage)
                
                # if response_content:
                #     response_content = clean_citation_references(response_content)

                f.write(f"## {title}\n\n")
                if response_content:
                    # f.write(f"{response_content}\n\n")
                    if citations:
                        f.write("Resources:\n")
                        for cit in citations:
                            f.write(f"- {cit}\n")
                    else:
                        f.write("No citations provided.\n")
                else:
                    f.write("Failed to retrieve information for this query.\n")
                
                f.write("\n" + "-"*80 + "\n\n")
                print(f"Finished: '{title}'. Data saved.")
                time.sleep(2) # Be polite to the API

        # 3. Print summary for the completed domain
        print("\n" + "-"*80)
        print(f"COMPLETED PROCESSING FOR DOMAIN: {domain}")
        print(f"Report saved to: {output_filename}")
        print(f"Total Estimated Cost for this domain: ${domain_total_cost:.4f}")
        print("-"*80)

    print("\n\nALL DOMAINS HAVE BEEN PROCESSED.")

# Run the main function
if __name__ == "__main__":
    main()


STARTING PROCESSING FOR DOMAIN: yeniyuzyil.edu.tr

[1/4] Processing 'Programs Names' for yeniyuzyil.edu.tr...
Finished: 'Programs Names'. Data saved.

[2/4] Processing 'Local Students - Tuition Fees' for yeniyuzyil.edu.tr...
Finished: 'Local Students - Tuition Fees'. Data saved.

[3/4] Processing 'Local Students - Application Fees and Deposit Fees' for yeniyuzyil.edu.tr...
Finished: 'Local Students - Application Fees and Deposit Fees'. Data saved.

[4/4] Processing 'International Students - Tuition Fees' for yeniyuzyil.edu.tr...
Finished: 'International Students - Tuition Fees'. Data saved.

--------------------------------------------------------------------------------
COMPLETED PROCESSING FOR DOMAIN: yeniyuzyil.edu.tr
Report saved to: yeniyuzyil.edu.tr_resources.txt
Total Estimated Cost for this domain: $0.2101
--------------------------------------------------------------------------------


ALL DOMAINS HAVE BEEN PROCESSED.


# Main function - NO dublicated urls

In [None]:
import requests
import os
import re
import time

# --- Configuration ---
# IMPORTANT: Set your API key as an environment variable for security.
# In your terminal (before starting Jupyter): export PERPLEXITY_API_KEY='your_api_key_here'
# Or, for testing in a notebook (less secure), you can uncomment the line below:
# os.environ['PERPLEXITY_API_KEY'] = 'pplx-...' 

PERPLEXITY_API_KEY = os.getenv("PERPLEXITY_API_KEY")

# Placeholder for the university domain
TARGET_DOMAINS = ["toros.edu.tr"] #"medipol.edu.tr" ,

# Output file name
OUTPUT_FILENAME = f"{TARGET_DOMAINS}_data.txt"

def generate_queries_for_domain(domain: str) -> list:
    """Generates a list of query dictionaries for the given domain."""
    # This function now holds the query templates.
    query_templates = [
        {
            'title': 'Programs Names',
            'query': f'Return 10 resources from {TARGET_DOMAINS} website listing programs for all degrees (Bachelor, Master, PhD, etc.),and return any page contanin programs fees and tuition fees search only in domain'
        },
        {
            'title': 'Local Students - Tuition Fees',
            'query': f'Return 10 resources from {TARGET_DOMAINS} website about tuition fees for local students across all programs, search only in domain'
        },
        {
            'title': 'Local Students - Application Fees and Deposit Fees',
            'query': f'What are the application fees and deposit fees for local students at {TARGET_DOMAINS} university, search only in domain'
        },
        {
            'title': 'International Students - Tuition Fees',
            'query': f'Return 10 resources from {TARGET_DOMAINS} website about tuition fees for international students across all programs, search only in domain'
        },
        {
            'title': 'International Students - Application Fees and Deposit Fees',
            'query': f'What are the application fees and deposit fees for international students at {TARGET_DOMAINS} university, search only in domain'
        },
        {
            'title': 'Program Coordinators/Director Names, Phones and Emails',
            'query': f'Retrieve program coordinators/director names, phone numbers and emails at {TARGET_DOMAINS} university, search only in domain'
        },
        {
            'title': 'Program Requirements',
            'query': f'Are there any specific requirements for programs at {TARGET_DOMAINS} university? If not found return "no information found", search only in domain'
        },
        {
            'title': 'Program-specific Scholarships',
            'query': f'Are there scholarships/financial aids for specific programs at {TARGET_DOMAINS} university? If not found return "no information found", search only in domain'
        },
        {
            'title': 'Career Path and Courses',
            'query': f'Find career paths, core courses, and elective courses for every program at {TARGET_DOMAINS} university, search only in domain'
        },
        {
            'title': 'Program Discounts',
            'query': f'What discounts are provided for programs at {TARGET_DOMAINS} university, search only in domain'
        },
        {
            'title': 'Minimum GPA and Required Documents',
            'query': f'What are the minimum GPA requirements and required documents for specific programs at {TARGET_DOMAINS} university, search only in domain'
        },
        {
            'title': 'Interview Requirements and Application Process',
            'query': f'What are the interview requirements and application process steps for specific programs at {TARGET_DOMAINS} university, search only in domain'
        },
        {
            'title': 'Additional Exams and Minimum Scores',
            'query': f'Are additional exams required for specific programs at {TARGET_DOMAINS} university? If yes, list minimum scores per program, search only in domain'
        }
    ]
    
    return [{'title': qt['title'], 'query': qt['query'].format(domain=domain)} for qt in query_templates]


def clean_citation_references(text: str) -> str:
    """
    Remove citation references like [1], [2], [3], [^1^], [^ranking^], etc. from the text.
    Remove <think> block .
    """
    # Pattern to match citation references: [number] or [number][number] etc.
    cleaned_text = re.sub(r'\[\d+\](?:\[\d+\])*', '', text)
    # Pattern to match citation references: [^number^] or [^text^]
    cleaned_text = re.sub(r'\[\^[^\]]+\^\]', '', cleaned_text)
    
    cleaned_text = re.sub(r'<think>.*?</think>', '', cleaned_text, flags=re.DOTALL).strip()
    
    return cleaned_text


def deduplicate_urls(citations: list) -> list:
    """
    Remove duplicate URLs from the citations list while preserving order.
    Also cleans up any malformed URLs.
    """
    seen_urls = set()
    unique_citations = []
    
    for citation in citations:
        # Clean up the URL - remove extra spaces and handle malformed entries
        clean_url = citation.strip()
        
        # Skip empty or very short entries
        if len(clean_url) < 10:
            continue
            
        # Ensure URL starts with http:// or https://
        if not clean_url.startswith(('http://', 'https://')):
            # Skip malformed URLs that don't start with proper protocol
            continue
        
        # Normalize URL by removing trailing slashes and converting to lowercase for comparison
        normalized_url = clean_url.rstrip('/').lower()
        
        # Only add if we haven't seen this URL before
        if normalized_url not in seen_urls:
            seen_urls.add(normalized_url)
            unique_citations.append(clean_url)  # Keep original case for output
    
    return unique_citations



def calc_sonar_pro_cost(meta: dict) -> float:
    """
    Return the USD cost for one Sonar Pro API call.

    Parameters
    ----------
    meta : dict
        A usage-metadata dictionary like the one Perplexity returns, e.g.
        {
            'completion_tokens': 694,
            'prompt_tokens': 37,
            'total_tokens': 731,
            'search_context_size': 'low'
        }

    Returns
    -------
    float
        Cost in US dollars (rounded to 6 decimals).
    """
    
    PRICING = {
    "low":    {"input_per_million": 3.0, "output_per_million": 15.0,
               "per_request": 6 / 1000},
    "medium": {"input_per_million": 3.0, "output_per_million": 15.0,
               "per_request": 10 / 1000},
    "high":   {"input_per_million": 3.0, "output_per_million": 15.0,
               "per_request": 14 / 1000},
    }
    
    
    ctx = meta.get("search_context_size", "low")
    p = PRICING[ctx]

    # convert per-million rates to per-token
    input_rate  = p["input_per_million"]  / 1_000_000
    output_rate = p["output_per_million"] / 1_000_000

    prompt_tokens     = meta.get("prompt_tokens", 0)
    completion_tokens = meta.get("completion_tokens", 0)

    token_cost   = prompt_tokens * input_rate + completion_tokens * output_rate
    request_cost = p["per_request"]

    return round(token_cost + request_cost, 6)


def perplexity_call(perplexity_query: str, domain: str, perplexity_model: str = "sonar-reasoning-pro", temperature: float = 0.01) -> tuple:
    """Calls the Perplexity API with a specific query and domain filter."""
    if not PERPLEXITY_API_KEY:
        print("API Error: PERPLEXITY_API_KEY environment variable not set.")
        return "", {}, []
    url = "https://api.perplexity.ai/chat/completions"
    headers = {"Authorization": f"Bearer {PERPLEXITY_API_KEY}", "Content-Type": "application/json"}
    payload = {
        "model": perplexity_model,
        "messages": [
            {"role": "system", "content": "You are a specialized web crawler and information extractor. Focus exclusively on the provided domain. Answer the user's query concisely and accurately based on the website's content. If information is not found, clearly state that."},
            {"role": "user", "content": perplexity_query}
        ],
        "temperature": temperature,
        "search_domain_filter": [domain],
        "return_citations": True,
        "web_search_options": {
            "search_context_size": "low"  # This is the key addition
        }
    }
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=120)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"API Request Error: {e}")
        return "", {}, []
    response_json = response.json()
    content = response_json['choices'][0]['message']['content']
    usage = response_json.get('usage', {})
    citations = response_json.get('citations', [])
    return content, usage, citations

def main():
    """Main function to orchestrate the processing of all domains."""
    if not PERPLEXITY_API_KEY:
        print("FATAL: PERPLEXITY_API_KEY is not set. Exiting.")
        return

    # Loop through each domain in the list
    for domain in TARGET_DOMAINS:
        print("\n" + "="*80)
        print(f"STARTING PROCESSING FOR DOMAIN: {domain}")
        print("="*80)

        # 1. Generate dynamic filename and queries for the current domain
        output_filename = f"{domain}_data.txt"
        queries = generate_queries_for_domain(domain)
        
        # 2. Initialize cost tracker for the current domain
        domain_total_cost = 0.0
        
        # 3. Track all URLs across all queries for this domain
        all_domain_urls = set()
        
        with open(output_filename, 'w', encoding='utf-8') as f:
            total_queries = len(queries)
            for i, item in enumerate(queries):
                title = item['title']
                query = item['query']
                
                print(f"\n[{i+1}/{total_queries}] Processing '{title}' for {domain}...")
                
                response_content, usage, citations = perplexity_call(
                    perplexity_query=query,
                    domain=domain
                )
                
                if usage:
                    domain_total_cost += calc_sonar_pro_cost(usage)
                
                # Clean and deduplicate citations
                if citations:
                    # First deduplicate within this query's citations
                    unique_citations = deduplicate_urls(citations)
                    
                    # Then check against all previously seen URLs for this domain
                    final_citations = []
                    for citation in unique_citations:
                        normalized_url = citation.rstrip('/').lower()
                        if normalized_url not in all_domain_urls:
                            all_domain_urls.add(normalized_url)
                            final_citations.append(citation)
                        else:
                            print(f"  Skipped duplicate URL: {citation}")
                    
                    citations = final_citations

                f.write(f"## {title}\n\n")
                if response_content:
                    # Uncomment the next line if you want to include cleaned response content
                    # response_content = clean_citation_references(response_content)
                    # f.write(f"{response_content}\n\n")
                    
                    if citations:
                        f.write("Resources:\n")
                        for cit in citations:
                            f.write(f"- {cit}\n")
                        print(f"  Added {len(citations)} unique URLs for '{title}'")
                    else:
                        f.write("No new unique citations found.\n")
                        print(f"  No unique URLs found for '{title}' (all were duplicates)")
                else:
                    f.write("Failed to retrieve information for this query.\n")
                
                f.write("\n" + "-"*80 + "\n\n")
                print(f"Finished: '{title}'. Data saved.")
                time.sleep(2) # Be polite to the API

        # 4. Print summary for the completed domain
        print("\n" + "-"*80)
        print(f"COMPLETED PROCESSING FOR DOMAIN: {domain}")
        print(f"Report saved to: {output_filename}")
        print(f"Total unique URLs collected: {len(all_domain_urls)}")
        print(f"Total Estimated Cost for this domain: ${domain_total_cost:.4f}")
        print("-"*80)

    print("\n\nALL DOMAINS HAVE BEEN PROCESSED.")



# Run the main function
if __name__ == "__main__":
    main()
    



STARTING PROCESSING FOR DOMAIN: toros.edu.tr

[1/13] Processing 'Programs Names' for toros.edu.tr...


  Added 10 unique URLs for 'Programs Names'
Finished: 'Programs Names'. Data saved.

[2/13] Processing 'Local Students - Tuition Fees' for toros.edu.tr...


# Removed markdown headers and "Resources:" labels 
### exactly the above script just save URLs without markdowns

In [2]:
import requests
import os
import re
import time

# --- Configuration ---
# IMPORTANT: Set your API key as an environment variable for security.
# In your terminal (before starting Jupyter): export PERPLEXITY_API_KEY='your_api_key_here'
# Or, for testing in a notebook (less secure), you can uncomment the line below:
# os.environ['PERPLEXITY_API_KEY'] = 'pplx-...' 

PERPLEXITY_API_KEY = os.getenv("PERPLEXITY_API_KEY")

# Placeholder for the university domain
# TARGET_DOMAINS = ["acsa.sa.edu.au","unipa.it","ue.edu.ph","iuk.ac.jp","yu.edu.sa","akad.de","faculdadeadventista.edu.br","leeds.ac.uk"] #"medipol.edu.tr" ,"cyberjaya.edu.my"
TARGET_DOMAINS = ["cyberjaya.edu.my"] #"medipol.edu.tr" ,"cyberjaya.edu.my"

# Output file name
OUTPUT_FILENAME = f"{TARGET_DOMAINS}_URLS.txt"

def generate_queries_for_domain(domain: str) -> list:
    """Generates a list of query dictionaries for the given domain."""
    # This function now holds the query templates.
    query_templates = [
        {
            'title': 'Programs Names Bachelor',
            'query': f'Return 10 resources from {TARGET_DOMAINS} website listing programs in 2025 for Bachelor and return programs fees and tuition fees page url, search only in domain'
        },
        {
            'title': 'Programs Names Master',
            'query': f'Return 10 resources from {TARGET_DOMAINS} website listing programs in 2025 for Master and return programs fees and tuition fees page url, search only in domain'
        },
        {
            'title': 'Programs Names PhD',
            'query': f'Return 10 resources from {TARGET_DOMAINS} website listing programs in 2025 for PhD and return programs fees and tuition fees page url, search only in domain'
        },
        {
            'title': 'faculties and departments',
            'query': f'Return 10 resources from {TARGET_DOMAINS} website listing faculties and departments in 2025, search only in domain'
        },
        {
            'title': 'Local Students - Tuition Fees',
            'query': f'Return 10 resources from {TARGET_DOMAINS} website about tuition fees in 2025 for local students across all programs, search only in domain'
        },
        {
            'title': 'Local Students - Application Fees and Deposit Fees',
            'query': f'What are the application fees and deposit fees in 2025 for local students at {TARGET_DOMAINS} university, search only in domain'
        },
        {
            'title': 'International Students - Tuition Fees',
            'query': f'Return 10 resources from {TARGET_DOMAINS} website about tuition fees in 2025 for international students across all programs, search only in domain'
        },
        {
            'title': 'International Students - Application Fees and Deposit Fees',
            'query': f'What are the application fees and deposit fees in 2025 for international students at {TARGET_DOMAINS} university, search only in domain'
        },
        {
            'title': 'Program Coordinators/Director Names, Phones and Emails',
            'query': f'Retrieve program coordinators/director names, phone numbers and emails at {TARGET_DOMAINS} university, search only in domain'
        },
        {
            'title': 'Program Requirements',
            'query': f'Are there any specific requirements for programs at {TARGET_DOMAINS} university? If not found return "no information found", search only in domain'
        },
        {
            'title': 'Program-specific Scholarships',
            'query': f'Are there scholarships/financial aids for specific programs at {TARGET_DOMAINS} university? If not found return "no information found", search only in domain'
        },
        {
            'title': 'Career Path and Courses',
            'query': f'Find career paths, core courses, and elective courses for every program at {TARGET_DOMAINS} university, search only in domain'
        },
        {
            'title': 'Program Discounts',
            'query': f'What discounts are provided for programs in 2025 at {TARGET_DOMAINS} university, search only in domain'
        },
        {
            'title': 'Minimum GPA and Required Documents',
            'query': f'What are the minimum GPA requirements and required documents for specific programs at {TARGET_DOMAINS} university, search only in domain'
        },
        {
            'title': 'Interview Requirements and Application Process',
            'query': f'What are the interview requirements and application process steps for specific programs at {TARGET_DOMAINS} university, search only in domain'
        },
        {
            'title': 'Additional Exams and Minimum Scores',
            'query': f'Are additional exams required for specific programs at {TARGET_DOMAINS} university? If yes, list minimum scores per program, search only in domain'
        }
    ]
    
    return [{'title': qt['title'], 'query': qt['query'].format(domain=domain)} for qt in query_templates]


def clean_citation_references(text: str) -> str:
    """
    Remove citation references like [1], [2], [3], [^1^], [^ranking^], etc. from the text.
    Remove <think> block .
    """
    # Pattern to match citation references: [number] or [number][number] etc.
    cleaned_text = re.sub(r'\[\d+\](?:\[\d+\])*', '', text)
    # Pattern to match citation references: [^number^] or [^text^]
    cleaned_text = re.sub(r'\[\^[^\]]+\^\]', '', cleaned_text)
    
    cleaned_text = re.sub(r'<think>.*?</think>', '', cleaned_text, flags=re.DOTALL).strip()
    
    return cleaned_text


def deduplicate_urls(citations: list) -> list:
    """
    Remove duplicate URLs from the citations list while preserving order.
    Also cleans up any malformed URLs.
    """
    seen_urls = set()
    unique_citations = []
    
    for citation in citations:
        # Clean up the URL - remove extra spaces and handle malformed entries
        clean_url = citation.strip()
        
        # Skip empty or very short entries
        if len(clean_url) < 10:
            continue
            
        # Ensure URL starts with http:// or https://
        if not clean_url.startswith(('http://', 'https://')):
            # Skip malformed URLs that don't start with proper protocol
            continue
        
        # Normalize URL by removing trailing slashes and converting to lowercase for comparison
        normalized_url = clean_url.rstrip('/').lower()
        
        # Only add if we haven't seen this URL before
        if normalized_url not in seen_urls:
            seen_urls.add(normalized_url)
            unique_citations.append(clean_url)  # Keep original case for output
    
    return unique_citations



def calc_sonar_pro_cost(meta: dict) -> float:
    """
    Return the USD cost for one Sonar Pro API call.

    Parameters
    ----------
    meta : dict
        A usage-metadata dictionary like the one Perplexity returns, e.g.
        {
            'completion_tokens': 694,
            'prompt_tokens': 37,
            'total_tokens': 731,
            'search_context_size': 'low'
        }

    Returns
    -------
    float
        Cost in US dollars (rounded to 6 decimals).
    """
    
    PRICING = {
    "low":    {"input_per_million": 3.0, "output_per_million": 15.0,
               "per_request": 6 / 1000},
    "medium": {"input_per_million": 3.0, "output_per_million": 15.0,
               "per_request": 10 / 1000},
    "high":   {"input_per_million": 3.0, "output_per_million": 15.0,
               "per_request": 14 / 1000},
    }
    
    
    ctx = meta.get("search_context_size", "low")
    p = PRICING[ctx]

    # convert per-million rates to per-token
    input_rate  = p["input_per_million"]  / 1_000_000
    output_rate = p["output_per_million"] / 1_000_000

    prompt_tokens     = meta.get("prompt_tokens", 0)
    completion_tokens = meta.get("completion_tokens", 0)

    token_cost   = prompt_tokens * input_rate + completion_tokens * output_rate
    request_cost = p["per_request"]

    return round(token_cost + request_cost, 6)


def perplexity_call(perplexity_query: str, domain: str, perplexity_model: str = "sonar-reasoning-pro", temperature: float = 0.01) -> tuple:
    """Calls the Perplexity API with a specific query and domain filter."""
    if not PERPLEXITY_API_KEY:
        print("API Error: PERPLEXITY_API_KEY environment variable not set.")
        return "", {}, []
    url = "https://api.perplexity.ai/chat/completions"
    headers = {"Authorization": f"Bearer {PERPLEXITY_API_KEY}", "Content-Type": "application/json"}
    payload = {
        "model": perplexity_model,
        "messages": [
            {"role": "system", "content": "You are a specialized web crawler and information extractor. Focus exclusively on the provided domain. Answer the user's query concisely and accurately based on the website's content. If information is not found, clearly state that."},
            {"role": "user", "content": perplexity_query}
        ],
        "temperature": temperature,
        "search_domain_filter": [domain],
        "return_citations": True,
        "web_search_options": {
            "search_context_size": "high"  # This is the key addition
        }
    }
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=120)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"API Request Error: {e}")
        return "", {}, []
    response_json = response.json()
    content = response_json['choices'][0]['message']['content']
    usage = response_json.get('usage', {})
    citations = response_json.get('citations', [])
    return content, usage, citations

def main():
    """Main function to orchestrate the processing of all domains."""
    if not PERPLEXITY_API_KEY:
        print("FATAL: PERPLEXITY_API_KEY is not set. Exiting.")
        return

    # Loop through each domain in the list
    for domain in TARGET_DOMAINS:
        print("\n" + "="*80)
        print(f"STARTING PROCESSING FOR DOMAIN: {domain}")
        print("="*80)

        # 1. Generate dynamic filename and queries for the current domain
        output_filename = f"{domain}_urls.txt"
        queries = generate_queries_for_domain(domain)
        
        # 2. Initialize cost tracker for the current domain
        domain_total_cost = 0.0
        
        # 3. Track all URLs across all queries for this domain
        all_domain_urls = set()
        
        with open(output_filename, 'w', encoding='utf-8') as f:
            total_queries = len(queries)
            for i, item in enumerate(queries):
                title = item['title']
                query = item['query']
                
                print(f"\n[{i+1}/{total_queries}] Processing '{title}' for {domain}...")
                
                response_content, usage, citations = perplexity_call(
                    perplexity_query=query,
                    domain=domain
                )
                
                if usage:
                    domain_total_cost += calc_sonar_pro_cost(usage)
                
                # Clean and deduplicate citations
                if citations:
                    # First deduplicate within this query's citations
                    unique_citations = deduplicate_urls(citations)
                    
                    # Then check against all previously seen URLs for this domain
                    final_citations = []
                    for citation in unique_citations:
                        normalized_url = citation.rstrip('/').lower()
                        if normalized_url not in all_domain_urls:
                            all_domain_urls.add(normalized_url)
                            final_citations.append(citation)
                        else:
                            print(f"  Skipped duplicate URL: {citation}")
                    
                    citations = final_citations

                # Write URLs directly without markdown formatting or titles
                if citations:
                    # Write each URL on a separate line
                    for citation in citations:
                        f.write(f"{citation}\n")
                    print(f"  Added {len(citations)} unique URLs for '{title}'")
                else:
                    print(f"  No unique URLs found for '{title}' (all were duplicates)")
                
                print(f"Finished: '{title}'. Data saved.")
                time.sleep(2) # Be polite to the API

        # 4. Print summary for the completed domain
        print("\n" + "-"*80)
        print(f"COMPLETED PROCESSING FOR DOMAIN: {domain}")
        print(f"Report saved to: {output_filename}")
        print(f"Total unique URLs collected: {len(all_domain_urls)}")
        print(f"Total Estimated Cost for this domain: ${domain_total_cost:.4f}")
        print("-"*80)

    print("\n\nALL DOMAINS HAVE BEEN PROCESSED.")



# Run the main function
if __name__ == "__main__":
    main()


STARTING PROCESSING FOR DOMAIN: cyberjaya.edu.my

[1/16] Processing 'Programs Names Bachelor' for cyberjaya.edu.my...
  Added 10 unique URLs for 'Programs Names Bachelor'
Finished: 'Programs Names Bachelor'. Data saved.

[2/16] Processing 'Programs Names Master' for cyberjaya.edu.my...
  Skipped duplicate URL: https://cyberjaya.edu.my/admission/financial-aid-options/fees-structure
  Skipped duplicate URL: https://cyberjaya.edu.my/programmes
  Skipped duplicate URL: https://cyberjaya.edu.my/wp-content/uploads/2025/09/UoC-Fee-Structure-Local-AUG-2025-Sept-Update.pdf
  Skipped duplicate URL: https://cyberjaya.edu.my/admission/academic-calendar-for-students
  Skipped duplicate URL: https://cyberjaya.edu.my/admission/financial-aid-options/fees-structure-international
  Skipped duplicate URL: https://cyberjaya.edu.my/admission/financial-aid-options/funding
  Skipped duplicate URL: https://cyberjaya.edu.my/admission/international-admissions
  Added 3 unique URLs for 'Programs Names Master'
F

# script for 200 univ from omar -get programs informations link (Threadpool)

In [None]:
import requests
import os
import re
import time
import pandas as pd
import tldextract
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
import threading

# --- Configuration ---
# IMPORTANT: Set your API key as an environment variable for security.
# In your terminal (before starting Jupyter): export PERPLEXITY_API_KEY='your_api_key_here'
# Or, for testing in a notebook (less secure), you can uncomment the line below:
# os.environ['PERPLEXITY_API_KEY'] = 'pplx-...' 

PERPLEXITY_API_KEY = os.getenv("PERPLEXITY_API_KEY")

# Excel file name and output folder
EXCEL_FILE = "50_univ_omar.xlsx"  # Change this to your Excel file name
OUTPUT_FOLDER = "50_uni_sample_perplexity"

# Parallel processing configuration
MAX_WORKERS = 30  # Number of parallel threads
BATCH_SIZE = 30   # Number of domains to process in each batch

# Thread-safe lock for cost tracking and console output
cost_lock = Lock()
print_lock = Lock()

def thread_safe_print(*args, **kwargs):
    """Thread-safe print function."""
    with print_lock:
        print(*args, **kwargs)

def extract_domains_from_excel(excel_file: str) -> list:
    """Extract domains from the Link column in Excel file."""
    try:
        # Read the Excel file
        df = pd.read_excel(excel_file)
        
        # Clean column names by stripping whitespace
        df.columns = df.columns.str.strip()
        
        # Check if 'Link' column exists
        if 'Link' not in df.columns:
            print("Error: 'Link' column not found in Excel file")
            print(f"Available columns: {list(df.columns)}")
            return []
        
        domains = []
        empty_links = 0
        invalid_domains = 0
        error_links = 0
        
        print("\nProcessing domains from Excel file...")
        for index, row in df.iterrows():
            link = row['Link']
            if pd.notna(link) and str(link).strip():  # Check if link is not NaN and not empty
                try:
                    link_str = str(link).strip()
                    # Extract domain components using tldextract
                    ext = tldextract.extract(link_str)
                    # Get the registered domain (without subdomain)
                    domain = f"{ext.domain}.{ext.suffix}"
                    if domain and domain != "." and ext.domain and ext.suffix:  # Make sure we have a valid domain
                        domains.append(domain)
                        if index < 5:  # Show first 5 for debugging
                            print(f"  Row {index+1}: {link_str} -> {domain}")
                    else:
                        invalid_domains += 1
                        if invalid_domains <= 3:  # Show first 3 invalid domains
                            print(f"  Invalid domain at row {index+1}: {link_str} -> {domain}")
                except Exception as e:
                    error_links += 1
                    if error_links <= 3:  # Show first 3 errors
                        print(f"  Error extracting domain from row {index+1} ({link}): {e}")
                    continue
            else:
                empty_links += 1
        
        # Remove duplicates while preserving order and count them
        original_count = len(domains)
        unique_domains = list(dict.fromkeys(domains))
        duplicates_removed = original_count - len(unique_domains)
        
        print(f"\nDomain extraction summary:")
        print(f"  Total entries in Excel: {len(df)}")
        print(f"  Empty/NaN links: {empty_links}")
        print(f"  Invalid domains: {invalid_domains}")
        print(f"  Errors during extraction: {error_links}")
        print(f"  Valid domains found: {original_count}")
        print(f"  Duplicate domains removed: {duplicates_removed}")
        print(f"  Final unique domains: {len(unique_domains)}")
        
        return unique_domains
        
    except FileNotFoundError:
        print(f"Error: Excel file '{excel_file}' not found")
        return []
    except Exception as e:
        print(f"Error reading Excel file: {e}")
        return []

def generate_queries_for_domain(domain: str) -> list:
    """Generates a list of query dictionaries for the given domain."""
    query_templates = [
        {
            'title': 'Programs Names Bachelor',
            'query': f'Return all resources from {TARGET_DOMAINS} website listing programs in 2025 for Bachelor and return programs fees and tuition fees page url, search only in domain'
        },
        {
            'title': 'Programs Names Master',
            'query': f'Return all resources from {TARGET_DOMAINS} website listing programs in 2025 for Master and return programs fees and tuition fees page url, search only in domain'
        },
        {
            'title': 'Programs Names PhD',
            'query': f'Return all resources from {TARGET_DOMAINS} website listing programs in 2025 for PhD and return programs fees and tuition fees page url, search only in domain'
        },
        {
            'title': 'faculties and departments',
            'query': f'Return all resources from {TARGET_DOMAINS} website listing faculties and departments in 2025, search only in domain'
        },
        {
            'title': 'Local Students - Tuition Fees',
            'query': f'Return all resources from {TARGET_DOMAINS} website about tuition fees in 2025 for local students across all programs, search only in domain'
        },
        {
            'title': 'Local Students - Application Fees and Deposit Fees',
            'query': f'What are the application fees and deposit fees in 2025 for local students at {TARGET_DOMAINS} university, search only in domain'
        },
        {
            'title': 'International Students - Tuition Fees',
            'query': f'Return all resources from {TARGET_DOMAINS} website about tuition fees in 2025 for international students across all programs, search only in domain'
        },
        {
            'title': 'International Students - Application Fees and Deposit Fees',
            'query': f'What are the application fees and deposit fees in 2025 for international students at {TARGET_DOMAINS} university, search only in domain'
        },
        {
            'title': 'Program Coordinators/Director Names, Phones and Emails',
            'query': f'Retrieve program coordinators/director names, phone numbers and emails at {TARGET_DOMAINS} university, search only in domain'
        },
        {
            'title': 'Program Requirements',
            'query': f'Are there any specific requirements for programs at {TARGET_DOMAINS} university? If not found return "no information found", search only in domain'
        },
        {
            'title': 'Program-specific Scholarships',
            'query': f'Are there scholarships/financial aids for specific programs at {TARGET_DOMAINS} university? If not found return "no information found", search only in domain'
        },
        {
            'title': 'Career Path and Courses',
            'query': f'Find career paths, core courses, and elective courses for every program at {TARGET_DOMAINS} university, search only in domain'
        },
        {
            'title': 'Program Discounts',
            'query': f'What discounts are provided for programs in 2025 at {TARGET_DOMAINS} university, search only in domain'
        },
        {
            'title': 'Minimum GPA and Required Documents',
            'query': f'What are the minimum GPA requirements and required documents for specific programs at {TARGET_DOMAINS} university, search only in domain'
        },
        {
            'title': 'Interview Requirements and Application Process',
            'query': f'What are the interview requirements and application process steps for specific programs at {TARGET_DOMAINS} university, search only in domain'
        },
        {
            'title': 'Additional Exams and Minimum Scores',
            'query': f'Are additional exams required for specific programs at {TARGET_DOMAINS} university? If yes, list minimum scores per program, search only in domain'
        }
    ]
    
    return query_templates

def clean_citation_references(text: str) -> str:
    """
    Remove citation references like [1], [2], [3], [^1^], [^ranking^], etc. from the text.
    Remove <think> block .
    """
    # Pattern to match citation references: [number] or [number][number] etc.
    cleaned_text = re.sub(r'\[\d+\](?:\[\d+\])*', '', text)
    # Pattern to match citation references: [^number^] or [^text^]
    cleaned_text = re.sub(r'\[\^[^\]]+\^\]', '', cleaned_text)
    
    cleaned_text = re.sub(r'<think>.*?</think>', '', cleaned_text, flags=re.DOTALL).strip()
    
    return cleaned_text

def deduplicate_urls(citations: list) -> list:
    """
    Remove duplicate URLs from the citations list while preserving order.
    Also cleans up any malformed URLs.
    """
    seen_urls = set()
    unique_citations = []
    
    for citation in citations:
        # Clean up the URL - remove extra spaces and handle malformed entries
        clean_url = citation.strip()
        
        # Skip empty or very short entries
        if len(clean_url) < 10:
            continue
            
        # Ensure URL starts with http:// or https://
        if not clean_url.startswith(('http://', 'https://')):
            # Skip malformed URLs that don't start with proper protocol
            continue
        
        # Normalize URL by removing trailing slashes and converting to lowercase for comparison
        normalized_url = clean_url.rstrip('/').lower()
        
        # Only add if we haven't seen this URL before
        if normalized_url not in seen_urls:
            seen_urls.add(normalized_url)
            unique_citations.append(clean_url)  # Keep original case for output
    
    return unique_citations

def calc_sonar_pro_cost(meta: dict) -> float:
    """
    Return the USD cost for one Sonar Pro API call.

    Parameters
    ----------
    meta : dict
        A usage-metadata dictionary like the one Perplexity returns, e.g.
        {
            'completion_tokens': 694,
            'prompt_tokens': 37,
            'total_tokens': 731,
            'search_context_size': 'low'
        }

    Returns
    -------
    float
        Cost in US dollars (rounded to 6 decimals).
    """
    
    PRICING = {
    "low":    {"input_per_million": 3.0, "output_per_million": 15.0,
               "per_request": 6 / 1000},
    "medium": {"input_per_million": 3.0, "output_per_million": 15.0,
               "per_request": 10 / 1000},
    "high":   {"input_per_million": 3.0, "output_per_million": 15.0,
               "per_request": 14 / 1000},
    }
    
    ctx = meta.get("search_context_size", "low")
    p = PRICING[ctx]

    # convert per-million rates to per-token
    input_rate  = p["input_per_million"]  / 1_000_000
    output_rate = p["output_per_million"] / 1_000_000

    prompt_tokens     = meta.get("prompt_tokens", 0)
    completion_tokens = meta.get("completion_tokens", 0)

    token_cost   = prompt_tokens * input_rate + completion_tokens * output_rate
    request_cost = p["per_request"]

    return round(token_cost + request_cost, 6)

def perplexity_call(perplexity_query: str, domain: str, perplexity_model: str = "sonar-reasoning-pro", temperature: float = 0.01) -> tuple:
    """Calls the Perplexity API with a specific query and domain filter."""
    if not PERPLEXITY_API_KEY:
        thread_safe_print("API Error: PERPLEXITY_API_KEY environment variable not set.")
        return "", {}, []
    url = "https://api.perplexity.ai/chat/completions"
    headers = {"Authorization": f"Bearer {PERPLEXITY_API_KEY}", "Content-Type": "application/json"}
    payload = {
        "model": perplexity_model,
        "messages": [
            {"role": "system", "content": "You are a specialized web crawler and information extractor. Focus exclusively on the provided domain. Answer the user's query concisely and accurately based on the website's content. If information is not found, clearly state that."},
            {"role": "user", "content": perplexity_query}
        ],
        "temperature": temperature,
        "search_domain_filter": [domain],
        "return_citations": True,
        "web_search_options": {
            "search_context_size": "high"  # This is the key addition
        }
    }
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=120)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        thread_safe_print(f"API Request Error for {domain}: {e}")
        return "", {}, []
    response_json = response.json()
    content = response_json['choices'][0]['message']['content']
    usage = response_json.get('usage', {})
    citations = response_json.get('citations', [])
    return content, usage, citations

def process_single_domain(domain: str, domain_index: int, total_domains: int) -> dict:
    """Process a single domain and return results."""
    thread_id = threading.current_thread().name
    thread_safe_print(f"\n[Thread {thread_id}] STARTING DOMAIN [{domain_index}/{total_domains}]: {domain}")
    
    try:
        # Generate dynamic filename for the current domain
        safe_domain_name = domain.replace('.', '_')  # Replace dots with underscores for filename
        output_filename = os.path.join(OUTPUT_FOLDER, f"{safe_domain_name}_urls.txt")
        queries = generate_queries_for_domain(domain)
        
        # Initialize cost tracker for the current domain
        domain_total_cost = 0.0
        
        # Track all URLs across all queries for this domain
        all_domain_urls = set()
        
        with open(output_filename, 'w', encoding='utf-8') as f:
            total_queries = len(queries)
            for i, item in enumerate(queries):
                title = item['title']
                query = item['query']
                
                thread_safe_print(f"[Thread {thread_id}] [{i+1}/{total_queries}] Processing '{title}' for {domain}...")
                
                response_content, usage, citations = perplexity_call(
                    perplexity_query=query,
                    domain=domain
                )
                
                if usage:
                    domain_total_cost += calc_sonar_pro_cost(usage)
                
                # Clean and deduplicate citations
                if citations:
                    # First deduplicate within this query's citations
                    unique_citations = deduplicate_urls(citations)
                    
                    # Then check against all previously seen URLs for this domain
                    final_citations = []
                    for citation in unique_citations:
                        normalized_url = citation.rstrip('/').lower()
                        if normalized_url not in all_domain_urls:
                            all_domain_urls.add(normalized_url)
                            final_citations.append(citation)
                        else:
                            thread_safe_print(f"  [Thread {thread_id}] Skipped duplicate URL: {citation}")
                    
                    citations = final_citations

                # Write URLs directly without markdown formatting or titles
                if citations:
                    # Write each URL on a separate line
                    for citation in citations:
                        f.write(f"{citation}\n")
                    thread_safe_print(f"  [Thread {thread_id}] Added {len(citations)} unique URLs for '{title}'")
                else:
                    thread_safe_print(f"  [Thread {thread_id}] No unique URLs found for '{title}' (all were duplicates)")
                
                # Small delay to be polite to the API
                time.sleep(0.5)

        # Return results for this domain
        result = {
            'domain': domain,
            'domain_index': domain_index,
            'output_filename': output_filename,
            'total_urls': len(all_domain_urls),
            'domain_cost': domain_total_cost,
            'thread_id': thread_id
        }
        
        thread_safe_print(f"[Thread {thread_id}] COMPLETED DOMAIN: {domain} | URLs: {len(all_domain_urls)} | Cost: ${domain_total_cost:.4f}")
        return result
        
    except Exception as e:
        thread_safe_print(f"[Thread {thread_id}] ERROR processing {domain}: {e}")
        return {
            'domain': domain,
            'domain_index': domain_index,
            'error': str(e),
            'thread_id': thread_id
        }

def process_domains_in_parallel(domains_batch: list, batch_start_index: int) -> list:
    """Process a batch of domains in parallel using ThreadPoolExecutor."""
    thread_safe_print(f"\n{'='*80}")
    thread_safe_print(f"STARTING PARALLEL BATCH: {len(domains_batch)} domains")
    thread_safe_print(f"Domains: {', '.join(domains_batch[:5])}{'...' if len(domains_batch) > 5 else ''}")
    thread_safe_print(f"{'='*80}")
    
    results = []
    
    # Use ThreadPoolExecutor for parallel processing
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Submit all tasks
        future_to_domain = {}
        for i, domain in enumerate(domains_batch):
            domain_index = batch_start_index + i + 1
            future = executor.submit(process_single_domain, domain, domain_index, len(domains_batch))
            future_to_domain[future] = domain
        
        # Collect results as they complete
        for future in as_completed(future_to_domain):
            domain = future_to_domain[future]
            try:
                result = future.result()
                results.append(result)
            except Exception as e:
                thread_safe_print(f"ERROR: Exception occurred for domain {domain}: {e}")
                results.append({
                    'domain': domain,
                    'error': str(e)
                })
    
    return results

def main():
    """Main function to orchestrate the processing of all domains."""
    if not PERPLEXITY_API_KEY:
        print("FATAL: PERPLEXITY_API_KEY is not set. Exiting.")
        return

    # Extract domains from Excel file
    target_domains = extract_domains_from_excel(EXCEL_FILE)
    
    if not target_domains:
        print("No domains found in Excel file. Exiting.")
        return

    # Create output folder if it doesn't exist
    Path(OUTPUT_FOLDER).mkdir(exist_ok=True)
    print(f"Output folder '{OUTPUT_FOLDER}' created/verified")

    print(f"\nStarting parallel processing with {MAX_WORKERS} threads")
    print(f"Processing {len(target_domains)} domains in batches of {BATCH_SIZE}")

    # Initialize overall cost tracker
    total_overall_cost = 0.0
    total_urls_collected = 0
    successful_domains = 0
    failed_domains = 0
    
    # Process domains in batches
    for batch_start in range(0, len(target_domains), BATCH_SIZE):
        batch_end = min(batch_start + BATCH_SIZE, len(target_domains))
        domains_batch = target_domains[batch_start:batch_end]
        
        batch_number = (batch_start // BATCH_SIZE) + 1
        total_batches = (len(target_domains) + BATCH_SIZE - 1) // BATCH_SIZE
        
        print(f"\n{'*'*80}")
        print(f"PROCESSING BATCH {batch_number}/{total_batches}")
        print(f"Domains {batch_start + 1} to {batch_end} of {len(target_domains)}")
        print(f"{'*'*80}")
        
        # Process this batch in parallel
        batch_results = process_domains_in_parallel(domains_batch, batch_start)
        
        # Aggregate results from this batch
        batch_cost = 0.0
        batch_urls = 0
        
        for result in batch_results:
            if 'error' not in result:
                batch_cost += result.get('domain_cost', 0)
                batch_urls += result.get('total_urls', 0)
                successful_domains += 1
                print(f"✓ {result['domain']}: {result['total_urls']} URLs, ${result['domain_cost']:.4f}")
            else:
                failed_domains += 1
                print(f"✗ {result['domain']}: ERROR - {result.get('error', 'Unknown error')}")
        
        total_overall_cost += batch_cost
        total_urls_collected += batch_urls
        
        print(f"\nBatch {batch_number} Summary:")
        print(f"  Batch Cost: ${batch_cost:.4f}")
        print(f"  Batch URLs: {batch_urls}")
        print(f"  Running Total Cost: ${total_overall_cost:.4f}")
        print(f"  Running Total URLs: {total_urls_collected}")

    print("\n" + "="*80)
    print("ALL DOMAINS PROCESSING COMPLETED!")
    print("="*80)
    print(f"Total domains processed: {len(target_domains)}")
    print(f"Successful domains: {successful_domains}")
    print(f"Failed domains: {failed_domains}")
    print(f"Total URLs collected: {total_urls_collected}")
    print(f"Total estimated cost: ${total_overall_cost:.4f}")
    print(f"All output files saved in: {OUTPUT_FOLDER}")
    print("="*80)

# Run the main function
if __name__ == "__main__":
    main()


Processing domains from Excel file...
  Row 1: https://kardan.edu.af/ -> kardan.edu.af
  Row 2: https://aburayhan.edu.af/ -> aburayhan.edu.af
  Row 3: https://acsa.sa.edu.au/ -> acsa.sa.edu.au
  Row 4: https://www.ac.edu.au/ -> ac.edu.au
  Row 5: https://maiwand.edu.af/ -> maiwand.edu.af

Domain extraction summary:
  Total entries in Excel: 49
  Empty/NaN links: 0
  Invalid domains: 0
  Errors during extraction: 0
  Valid domains found: 49
  Duplicate domains removed: 0
  Final unique domains: 49
Output folder '50_uni_sample_perplexity' created/verified

Starting parallel processing with 30 threads
Processing 49 domains in batches of 30

********************************************************************************
PROCESSING BATCH 1/2
Domains 1 to 30 of 49
********************************************************************************

STARTING PARALLEL BATCH: 30 domains
Domains: kardan.edu.af, aburayhan.edu.af, acsa.sa.edu.au, ac.edu.au, maiwand.edu.af...

[Thread ThreadPoolExecu

# NEW-Perplexity search api

In [None]:
! pip install perplexityai

In [13]:
import os
from dotenv import load_dotenv
from perplexity import Perplexity

load_dotenv()

client = Perplexity() # Uses PERPLEXITY_API_KEY from .env file
TARGET_DOMAINS= 'cyberjaya.edu.my'
search = client.search.create(
    query=[
           f"Return 10 resources from {TARGET_DOMAINS} website listing programs in 2025 for Master and return programs fees and tuition fees page url, search only in domain",
           f"Return 10 resources from {TARGET_DOMAINS} website listing programs in 2025 for PHD and doctorate and return programs fees and tuition fees page url, search only in domain",
           f"Return 10 resources from {TARGET_DOMAINS} website listing programs in 2025 for Bachelor and return programs fees and tuition fees page url, search only in domain",
    ],
    max_results=10
)

# Results are combined and ranked
for i, result in enumerate(search.results):
    print(f"{i + 1}. {result.title}")
    # print(f"   search result:  {search.results}")
    print(f"   URL: {result.url}")
    print(f"   Date: {result.date}\n")

1. Fees Structure International | University of Cyberjaya
   URL: https://cyberjaya.edu.my/admission/financial-aid-options/fees-structure-international
   Date: 2025-08-25

2. Explore All Our Programmes
   URL: https://cyberjaya.edu.my/programmes
   Date: 2024-02-13

3. University of Cyberjaya - Premier University in Malaysia
   URL: https://cyberjaya.edu.my
   Date: 2025-06-09

4. Cyberjaya university Fees 2024-2025
   URL: https://www.scribd.com/document/858863923/Cyberjaya-university-Fees-2024-2025
   Date: 2025-06-14

5. University of Cyberjaya Fees & Courses 2025 | Apply With Uni Enrol
   URL: https://articles.unienrol.com/university-of-cyberjaya-fees-courses/
   Date: 2025-04-24

6. University of Cyberjaya (UoC) Courses, Ranking, Tuition Fees
   URL: https://globalassistant.info/university-of-cyberjaya-uoc-courses-ranking-tuition-fees-scholarship/
   Date: 2025-09-24

7. Cyberjaya Education Group Berhad
   URL: https://cyberjaya.education
   Date: 2025-07-14

8. MMU: Discover Exc

In [19]:
import os
from dotenv import load_dotenv
from perplexity import Perplexity

# Load API key from .env
load_dotenv()

client = Perplexity()  # Uses PERPLEXITY_API_KEY from .env file
TARGET_DOMAINS = 'cyberjaya.edu.my'

# Perform the search
search = client.search.create(
    query=f"Return 10 resources from {TARGET_DOMAINS} website listing programs in 2025 for Master and return programs fees and tuition fees page url, search only in domain",
    max_results=8
)

# Display results neatly
for i, result in enumerate(search.results, start=1):
    # clean snippet: remove multiple newlines & strip spaces
    snippet = result.snippet.replace("\n\n", "\n").strip()
    
    print(f"--->url ({i}) {result.url}")
    print(f"   Title: {result.title}")
    print(f"   Snippet: {snippet}")
    print(f"   Date: {result.date}\n")


--->url (1) https://cyberjaya.edu.my/programmes
   Title: Explore All Our Programmes
   Snippet: Find Programmes
Certificate
Foundation
Diploma
Bachelor
Masters
PhD & Research
Browse All
UNIVERSITY
About
Campus Life
Admission
Faculties & Centres
News & Events
Leadership
UoC Health & Psychology Clinic
QUICKLINKS
Quicklinks for Students
Staff Quicklinks
Contact
Research
Colleges
Cyberjaya College Central
Cyberjaya College Kota Kinabalu
Cyberjaya College Kuching
Admission Link
Guidelines
Online Application
Scholarship
Home
Find a Programme
Gain interdisciplinary knowledge and graduate career-ready with a comparable degree aligned to leading universities worldwide.
Search
Search
Search
Clear
By Levels
By Levels
By Levels
All Level
Bachelor (21)
Certificate (4)
Diploma (14)
Foundation (4)
Masters (14)
PhD (4)
PhD/Doctorate (1)
By Faculty
By Faculty
All
Business & Technology
Allied Health Science
Psychology & Social Science
The Centre for Foundation, Languages and General Studies
Creative Ar

# Sonar model-NEW-Perplexity search api

In [31]:
import os
from dotenv import load_dotenv
from perplexity import Perplexity

# Load API key from .env
load_dotenv()

client = Perplexity()

# Set your target domain(s)
TARGET_DOMAINS = ["cyberjaya.edu.my"]

# Ask a general university-level question
query = f" listing programs in 2025 for Bachelor and return programs fees and tuition fees page url, search only in domain"

response = client.chat.completions.create(
    model="sonar",  # or "sonar-pro" for better quality
    messages=[
        {
            "role": "system", 
            "content": "You are a helpful research assistant. Provide comprehensive answers with proper citations."
        },
        {
            "role": "user",
            "content": query
        }
    ],
    search_domain_filter=TARGET_DOMAINS,
    max_tokens=2
)
print("full response:")
print(response)
# ---- Format the output ----
print("\n=== University Vision Summary ===\n")
print(response.choices[0].message.content.strip())

print("\n=== Citations ===")
for url in response.citations:
    print(f"- {url}")

print("\n=== Search Results ===")
for i, r in enumerate(response.search_results, 1):
    snippet = r.snippet.replace("\n\n", " ").strip()
    print(f"{i}. {r.title}")
    print(f"   URL: {r.url}")
    if r.date:
        print(f"   Date: {r.date}")
    if r.last_updated:
        print(f"   Last Updated: {r.last_updated}")
    print(f"   Snippet: {snippet}\n")

# ---- Show cost details ----
print("\n=== Cost Info ===")
print(f"Total Cost: ${response.usage.cost.total_cost:.4f}")
print(f"  • Input tokens cost: ${response.usage.cost.input_tokens_cost}")
print(f"  • Output tokens cost: ${response.usage.cost.output_tokens_cost}")
print(f"  • Request cost: ${response.usage.cost.request_cost}")
if response.usage.cost.citation_tokens_cost is not None:
    print(f"  • Citation tokens cost: ${response.usage.cost.citation_tokens_cost}")
if response.usage.cost.search_queries_cost is not None:
    print(f"  • Search queries cost: ${response.usage.cost.search_queries_cost}")



full response:
CompletionCreateResponse(id='7dd4277d-fff8-4138-a981-1ce58dcb1090', choices=[Choice(delta=ChatMessageOutput(content='', role='assistant', reasoning_steps=None, tool_calls=None), index=0, message=ChatMessageOutput(content='For', role='assistant', reasoning_steps=None, tool_calls=None), finish_reason='length')], created=1759239827, model='sonar', usage=UsageInfo(completion_tokens=2, cost=Cost(input_tokens_cost=0.0, output_tokens_cost=0.0, total_cost=0.005, citation_tokens_cost=None, reasoning_tokens_cost=None, request_cost=0.005, search_queries_cost=None), prompt_tokens=36, total_tokens=38, citation_tokens=None, num_search_queries=None, reasoning_tokens=None, search_context_size='low'), citations=['https://cyberjaya.edu.my/admission/financial-aid-options/fees-structure-international', 'https://cyberjaya.edu.my/admission/financial-aid-options/fees-structure', 'https://cyberjaya.edu.my/admission/financial-aid-options/funding', 'https://cyberjaya.edu.my/funding/uoc-dietetics-

In [None]:
import perplexity
from perplexity import Perplexity

client = Perplexity()

try:
    # Set your target domain(s)
    TARGET_DOMAINS = ["cyberjaya.edu.my"]

    # Ask a general university-level question
    query = f" listing programs in 2025 for Bachelor and return programs fees and tuition fees page url, search only in domain"

    response = client.chat.completions.create(
        model="sonar",  # or "sonar-pro" for better quality
        messages=[
            {
                "role": "system", 
                "content": "You are a helpful research assistant. Provide comprehensive answers with proper citations."
            },
            {
                "role": "user",
                "content": query
            }
        ],
        search_domain_filter=TARGET_DOMAINS,
        max_tokens=2
    )
except perplexity.BadRequestError as e:
    print(f"Invalid search parameters: {e}")
except perplexity.RateLimitError as e:
    print("Search rate limit exceeded")
except perplexity.APIStatusError as e:
    print(f"Search API error {e.status_code}: {e}")
except Exception as e:
    print(f"Unexpected error: {e}")

Invalid search parameters: Error code: 400 - {'error': {'message': 'Message content was empty', 'type': 'invalid_message', 'code': 400}}
