Python Scripting Guide
Use SciLEx as a Python library to integrate paper collection into your own scripts and workflows.
Setup
All SciLEx modules in src/ rely on YAML config files. Add src/ to your Python path before importing:
import os
import sys

# Resolve src/ next to this script and put it first on the import path so
# that crawlers and the other SciLEx modules are importable.
script_dir = os.path.dirname(__file__)
sys.path.insert(0, os.path.join(script_dir, 'src'))
Load configs using yaml:
import yaml

# Main configuration (keywords, years, APIs, output settings, ...).
with open("src/scilex.config.yml", encoding="utf-8") as f:
    main_config = yaml.safe_load(f)

# Per-API configuration.
# NOTE(review): assumed to sit beside scilex.config.yml in src/ — confirm.
with open("src/api.config.yml", encoding="utf-8") as f:
    api_config = yaml.safe_load(f)
Or build configs entirely in Python (no YAML files needed):
# Main settings: the in-code counterpart of the main YAML config file.
main_config = dict(
    keywords=[["machine learning", "deep learning"], ["healthcare"]],
    years=[2024, 2025],
    apis=["SemanticScholar", "OpenAlex"],
    output_dir="output",
    collect_name="collect_20250101_120000",
    collect=True,
    aggregate_get_citations=False,
    aggregate_file="aggregated_results.csv",
)

# One settings mapping (empty here) for each API listed under "apis".
api_config = {"SemanticScholar": {}, "OpenAlex": {}}
Collect Papers
Run API collection programmatically using CollectCollection:
import os
import sys

import yaml

sys.path.insert(0, 'src')
from crawlers.collector_collection import CollectCollection

# Collection uses multiprocessing in spawn mode, so the code that starts it
# must sit behind the __main__ guard; otherwise worker processes would run it
# again when they import this script.
if __name__ == "__main__":
    # main_config and api_config are the dicts loaded or built during setup.

    # Ensure output directory exists (no error if it is already there)
    output_dir = main_config.get("output_dir", "output")
    os.makedirs(output_dir, exist_ok=True)

    # Save config snapshot (required for aggregation)
    snapshot_path = os.path.join(output_dir, "config_used.yml")
    with open(snapshot_path, "w", encoding="utf-8") as f:
        yaml.dump(main_config, f)

    # Run collection
    collector = CollectCollection(main_config, api_config)
    collector.create_collects_jobs()
Aggregate and Filter
The aggregation script reads its configuration at import time, so set sys.argv to the desired command-line arguments before importing it, then call its main():
import sys
# Make the modules under src/ importable.
sys.path.insert(0, 'src')
# Set arguments before importing: the aggregation script reads its
# configuration at import time, so sys.argv must already hold the desired
# command line. Element 0 stands in for the program name; the remaining
# elements are the options.
sys.argv = ["aggregate", "--skip-citations", "--workers", "3"]
from aggregate_collect import main as aggregate_main
# Run the aggregation with the arguments set above.
aggregate_main()
Then read the results:
import pandas as pd

# The aggregated output is a semicolon-separated CSV.
papers = pd.read_csv(
    "output/collect_20250101_120000/aggregated_results.csv",
    sep=";",
)

print(f"Total papers: {len(papers)}")

# Number of papers per publication year, in chronological order.
per_year = papers["year"].value_counts().sort_index()
print(f"Papers by year:\n{per_year}")

print("\nTop 10 cited:")
most_cited = papers.nlargest(10, "nb_citation")
print(most_cited[["title", "nb_citation"]])
Export to BibTeX
import sys

sys.path.insert(0, 'src')  # make the modules under src/ importable

import export_to_bibtex

# Creates: output/collect_*/aggregated_results.bib
export_to_bibtex.main()
Push to Zotero
import sys

sys.path.insert(0, 'src')  # make the modules under src/ importable

import push_to_Zotero_collect

# Run the Zotero push through the module's main() entry point.
push_to_Zotero_collect.main()
Full Pipeline Script
A complete end-to-end script combining all steps:
"""Full SciLEx pipeline: collect, aggregate, export."""
import os
import sys
import yaml
# ── Path setup ────────────────────────────────────────────────────
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
# ── 1. Configuration ──────────────────────────────────────────────
main_config = {
"keywords": [["large language model", "LLM"], ["evaluation", "benchmark"]],
"years": [2024, 2025],
"apis": ["SemanticScholar", "OpenAlex", "Arxiv"],
"output_dir": "output",
"collect_name": "llm_benchmarks",
"collect": True,
"aggregate_get_citations": False,
"aggregate_file": "aggregated_results.csv",
"quality_filters": {
"enable_itemtype_filter": True,
"allowed_item_types": ["journalArticle", "conferencePaper", "preprint"],
"apply_relevance_ranking": True,
"max_papers": 200,
},
}
api_config = {
"SemanticScholar": {},
"OpenAlex": {},
}
# ── 2. Collection ─────────────────────────────────────────────────
from crawlers.collector_collection import CollectCollection
output_dir = main_config["output_dir"]
os.makedirs(output_dir, exist_ok=True)
config_path = os.path.join(output_dir, "config_used.yml")
if not os.path.exists(config_path):
with open(config_path, "w") as f:
yaml.dump(main_config, f)
collector = CollectCollection(main_config, api_config)
collector.create_collects_jobs()
print("Collection complete.")
# ── 3. Aggregation ────────────────────────────────────────────────
sys.argv = ["aggregate", "--skip-citations"]
from aggregate_collect import main as aggregate_main
aggregate_main()
print("Aggregation complete.")
# ── 4. Analyze results ────────────────────────────────────────────
import pandas as pd
csv_path = os.path.join(
output_dir, main_config["collect_name"], "aggregated_results.csv"
)
df = pd.read_csv(csv_path, delimiter=";")
print(f"\nResults: {len(df)} papers")
print(f"Sources: {df['archive'].value_counts().to_dict()}")
print(f"Years: {df['year'].value_counts().sort_index().to_dict()}")
Important Notes
Working directory: Run scripts from the project root so relative paths resolve correctly.
Path setup: Always add `src/` to `sys.path` before importing SciLEx modules.
Multiprocessing: Collection uses spawn mode. Always run collection code inside an `if __name__ == "__main__":` guard.
sys.argv: Modules that use `argparse` parse `sys.argv` in their `main()`. Set `sys.argv` before calling `main()` to pass arguments programmatically.
CSV delimiter: The output CSV uses `;` as delimiter, not `,`.
Next Steps
Basic Workflow - CLI-based workflow
Advanced Filtering - Filtering options
Configuration - All config parameters