Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- openenv_aegis_env.egg-info/PKG-INFO +1 -1
- openenv_aegis_env.egg-info/requires.txt +1 -1
- pyproject.toml +3 -1
- server/aegis_env_environment.py +146 -27
openenv_aegis_env.egg-info/PKG-INFO
CHANGED
|
@@ -6,8 +6,8 @@ Requires-Python: >=3.10
|
|
| 6 |
Requires-Dist: openenv-core[core]>=0.2.2
|
| 7 |
Requires-Dist: openai>=1.0.0
|
| 8 |
Requires-Dist: python-dotenv>=1.0.0
|
| 9 |
-
Requires-Dist: datasets>=2.19.0
|
| 10 |
Requires-Dist: huggingface-hub>=0.23.0
|
|
|
|
| 11 |
Provides-Extra: dev
|
| 12 |
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
| 13 |
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
|
|
| 6 |
Requires-Dist: openenv-core[core]>=0.2.2
|
| 7 |
Requires-Dist: openai>=1.0.0
|
| 8 |
Requires-Dist: python-dotenv>=1.0.0
|
|
|
|
| 9 |
Requires-Dist: huggingface-hub>=0.23.0
|
| 10 |
+
Requires-Dist: datasets>=2.19.0
|
| 11 |
Provides-Extra: dev
|
| 12 |
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
| 13 |
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
openenv_aegis_env.egg-info/requires.txt
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
openenv-core[core]>=0.2.2
|
| 2 |
openai>=1.0.0
|
| 3 |
python-dotenv>=1.0.0
|
| 4 |
-
datasets>=2.19.0
|
| 5 |
huggingface-hub>=0.23.0
|
|
|
|
| 6 |
|
| 7 |
[dev]
|
| 8 |
pytest>=8.0.0
|
|
|
|
| 1 |
openenv-core[core]>=0.2.2
|
| 2 |
openai>=1.0.0
|
| 3 |
python-dotenv>=1.0.0
|
|
|
|
| 4 |
huggingface-hub>=0.23.0
|
| 5 |
+
datasets>=2.19.0
|
| 6 |
|
| 7 |
[dev]
|
| 8 |
pytest>=8.0.0
|
pyproject.toml
CHANGED
|
@@ -18,8 +18,10 @@ dependencies = [
|
|
| 18 |
# install from github
|
| 19 |
# "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
|
| 20 |
"openenv-core[core]>=0.2.2",
|
| 21 |
-
"pymongo>=4.6.0",
|
| 22 |
"openai>=1.0.0",
|
|
|
|
|
|
|
|
|
|
| 23 |
# Environment-specific dependencies
|
| 24 |
# Add all dependencies needed for your environment here
|
| 25 |
# Examples:
|
|
|
|
| 18 |
# install from github
|
| 19 |
# "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
|
| 20 |
"openenv-core[core]>=0.2.2",
|
|
|
|
| 21 |
"openai>=1.0.0",
|
| 22 |
+
"python-dotenv>=1.0.0",
|
| 23 |
+
"huggingface-hub>=0.23.0",
|
| 24 |
+
"datasets>=2.19.0",
|
| 25 |
# Environment-specific dependencies
|
| 26 |
# Add all dependencies needed for your environment here
|
| 27 |
# Examples:
|
server/aegis_env_environment.py
CHANGED
|
@@ -7,13 +7,16 @@
|
|
| 7 |
"""
|
| 8 |
AEGIS-Env: automated grading simulation with deterministic rewards.
|
| 9 |
|
| 10 |
-
|
|
|
|
| 11 |
"""
|
| 12 |
|
| 13 |
from __future__ import annotations
|
| 14 |
|
|
|
|
| 15 |
import os
|
| 16 |
import random
|
|
|
|
| 17 |
from typing import Any, Dict, List, Optional
|
| 18 |
from uuid import uuid4
|
| 19 |
|
|
@@ -46,6 +49,145 @@ def _jaccard(a_text: str, b_text: str) -> float:
|
|
| 46 |
return float(inter) / float(union) if union else 0.0
|
| 47 |
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
class AegisEnvironment(Environment[AegisAction, AegisObservation, State]):
|
| 50 |
"""
|
| 51 |
Single-step grading episode: reset samples a row; step scores the agent output.
|
|
@@ -68,33 +210,10 @@ class AegisEnvironment(Environment[AegisAction, AegisObservation, State]):
|
|
| 68 |
self.current_reference_feedback: str = ""
|
| 69 |
self.current_max_score: float = 1.0
|
| 70 |
|
| 71 |
-
uri = os.environ.get("MONGO_URI")
|
| 72 |
-
if not uri:
|
| 73 |
-
self._load_error = "MONGO_URI is not set; dataset is empty."
|
| 74 |
-
return
|
| 75 |
-
|
| 76 |
try:
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
client = MongoClient(uri, serverSelectionTimeoutMS=10_000)
|
| 80 |
-
try:
|
| 81 |
-
coll = client["AEGIS"]["AEGIS-Eval-v2"]
|
| 82 |
-
projection = {
|
| 83 |
-
"dataset": 1,
|
| 84 |
-
"question": 1,
|
| 85 |
-
"rubrics": 1,
|
| 86 |
-
"student_response": 1,
|
| 87 |
-
"max_score": 1,
|
| 88 |
-
"obtained_score": 1,
|
| 89 |
-
"reference_feedback": 1,
|
| 90 |
-
}
|
| 91 |
-
cursor = coll.find({}, projection)
|
| 92 |
-
for doc in cursor:
|
| 93 |
-
self.dataset.append(doc)
|
| 94 |
-
finally:
|
| 95 |
-
client.close()
|
| 96 |
except Exception as e:
|
| 97 |
-
self._load_error = f"
|
| 98 |
self.dataset = []
|
| 99 |
|
| 100 |
def reset(
|
|
@@ -140,7 +259,7 @@ class AegisEnvironment(Environment[AegisAction, AegisObservation, State]):
|
|
| 140 |
|
| 141 |
# Store the ground truth for the deterministic reward calculation in step()
|
| 142 |
self.current_ground_truth = float(selected_record.get("obtained_score", 0.0))
|
| 143 |
-
self.current_reference_feedback =
|
| 144 |
self.current_max_score = float(selected_record.get("max_score", 1.0) or 1.0)
|
| 145 |
|
| 146 |
# Ensure rubrics are handled even if missing (like in ASAP-SAS)
|
|
|
|
| 7 |
"""
|
| 8 |
AEGIS-Env: automated grading simulation with deterministic rewards.
|
| 9 |
|
| 10 |
+
The dataset is downloaded from Hugging Face and cached on disk; ``reset`` and
|
| 11 |
+
``step`` are CPU-only.
|
| 12 |
"""
|
| 13 |
|
| 14 |
from __future__ import annotations
|
| 15 |
|
| 16 |
+
import json
|
| 17 |
import os
|
| 18 |
import random
|
| 19 |
+
from pathlib import Path
|
| 20 |
from typing import Any, Dict, List, Optional
|
| 21 |
from uuid import uuid4
|
| 22 |
|
|
|
|
| 49 |
return float(inter) / float(union) if union else 0.0
|
| 50 |
|
| 51 |
|
| 52 |
+
def _load_dotenv_if_available() -> None:
    """Load variables from a ``.env`` file when python-dotenv can be used.

    Values from the ``.env`` file take precedence over variables already in
    the process environment (``override=True``). Any failure, including
    python-dotenv not being importable, is ignored so this call never raises.
    """
    try:
        import dotenv

        dotenv.load_dotenv(override=True)
    except Exception:
        # Deliberately best-effort: a missing package or an unreadable .env
        # file must not prevent the environment from starting.
        return None
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def _cache_dir() -> Path:
    """Return the directory used to cache AEGIS dataset files.

    The first non-empty value among ``AEGIS_CACHE_DIR``, ``HF_HOME`` and
    ``XDG_CACHE_HOME`` (checked in that order) is used as the root, with an
    ``aegis_env`` sub-directory appended. When none is set, the cache lives in
    ``.cache/aegis_env`` two levels above this file, so it also works in
    sandboxed runners. The directory is not created here.
    """
    for env_var in ("AEGIS_CACHE_DIR", "HF_HOME", "XDG_CACHE_HOME"):
        base = os.environ.get(env_var)
        if base:
            return Path(base) / "aegis_env"
    # No override configured: fall back to a repo-local cache.
    return Path(__file__).resolve().parents[1] / ".cache" / "aegis_env"
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def _unwrap_object_id(v: Any) -> str:
    """Return a record id as a plain string.

    Accepts either the wrapped form ``{"$oid": "..."}`` or an id that is
    already a bare value. Falsy ids (``None``, ``""``, ``0``, empty dict, or a
    wrapped falsy ``$oid``) yield the empty string.
    """
    wrapped = isinstance(v, dict) and "$oid" in v
    raw = v["$oid"] if wrapped else v
    return str(raw) if raw else ""
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def _unwrap_number(v: Any) -> Optional[float]:
    """Convert a JSON number, or an Extended-JSON number wrapper, to ``float``.

    Accepted inputs:

    * ``None`` -> ``None``
    * ``int`` / ``float`` -> ``float(v)``
    * ``{"$numberDouble": ...}``, ``{"$numberInt": ...}``,
      ``{"$numberLong": ...}`` or ``{"$numberDecimal": ...}`` -> the wrapped
      payload converted with ``float``; the special payloads ``"Infinity"``,
      ``"-Infinity"`` and ``"NaN"`` map to the matching float values
    * anything else is passed to ``float`` (so numeric strings work)

    Returns ``None`` when the value cannot be interpreted as a number.
    """
    if v is None:
        return None
    if isinstance(v, (int, float)):
        return float(v)
    if isinstance(v, dict):
        for wrapper_key in ("$numberDouble", "$numberInt", "$numberLong", "$numberDecimal"):
            if wrapper_key in v:
                # Convert the wrapped payload, not the wrapper dict itself;
                # float() natively parses "Infinity", "-Infinity" and "NaN".
                v = v[wrapper_key]
                break
        else:
            # A dict without a recognised number wrapper is not a number.
            return None
    try:
        return float(v)
    except (TypeError, ValueError, OverflowError):
        return None
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def _reference_feedback_from_record(rec: Dict[str, Any]) -> str:
    """Extract the reference feedback text from a dataset record.

    The newer schema keeps feedback under ``evaluation.agent_feedback`` as the
    fields ``score_justification`` and ``improvement_advice``; the non-``None``
    ones are stripped and joined with a single space. If that yields no text,
    the legacy top-level ``reference_feedback`` field is used, defaulting to
    the empty string.
    """
    evaluation = rec.get("evaluation") or {}
    feedback: Any = {}
    if isinstance(evaluation, dict):
        feedback = evaluation.get("agent_feedback") or {}

    if isinstance(feedback, dict):
        pieces: List[str] = []
        for field in ("score_justification", "improvement_advice"):
            value = feedback.get(field)
            if value is not None:
                pieces.append(str(value).strip())
        combined = " ".join(pieces).strip()
        if combined:
            return combined

    # Backward-compat: old field name.
    legacy = rec.get("reference_feedback")
    return str(legacy or "")
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def _download_dataset_json(repo_id: str, filename: str, revision: Optional[str]) -> Path:
    """Fetch a dataset file from the Hugging Face Hub and cache a stable copy.

    The file is first requested through ``hf_hub_download`` using a cache under
    ``<cache_dir>/hf``. If that fails for any reason (for example blocked
    network access), a second attempt is made against the default Hugging Face
    cache with ``local_files_only=True``; an error from that second attempt
    propagates to the caller.

    The downloaded file is then copied to
    ``<cache_dir>/<repo_id with '/' replaced by '__'>__<filename>``.

    Args:
        repo_id: Hub dataset repository id, e.g. ``"owner/name"``.
        filename: Name of the file inside the repository.
        revision: Optional revision passed through to ``hf_hub_download``.

    Returns:
        The stable cache path when the copy succeeded, otherwise the path
        returned by ``hf_hub_download``.
    """
    from huggingface_hub import hf_hub_download  # type: ignore[import-not-found]

    cache_dir = _cache_dir()
    cache_dir.mkdir(parents=True, exist_ok=True)
    try:
        downloaded = hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            repo_type="dataset",
            revision=revision,
            cache_dir=str(cache_dir / "hf"),
        )
    except Exception:
        # In some sandboxed environments, network access to Hugging Face may be blocked.
        # If the file is already present in the global HF cache, fall back to it.
        downloaded = hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            repo_type="dataset",
            revision=revision,
            cache_dir=None,
            local_files_only=True,
        )

    stable_path = cache_dir / f"{repo_id.replace('/', '__')}__{filename}"
    # Write to a per-process temp file and rename it into place, so an
    # interrupted copy can never leave a truncated file at ``stable_path``
    # that a later start would pick up as a valid cache.
    tmp_path = stable_path.with_name(f"{stable_path.name}.{os.getpid()}.tmp")
    try:
        tmp_path.write_bytes(Path(downloaded).read_bytes())
        os.replace(tmp_path, stable_path)
        return stable_path
    except Exception:
        # Best-effort copy: on any failure, clean up and use the HF cache file.
        try:
            tmp_path.unlink(missing_ok=True)
        except OSError:
            pass
        return Path(downloaded)
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def _load_dataset_records() -> List[Dict[str, Any]]:
    """Load and normalise the grading dataset from the local cache or the Hub.

    Configuration is read from the environment (after an optional ``.env``
    load):

    * ``AEGIS_HF_DATASET_REPO`` - dataset repo id
      (default ``NishithP2004/AEGIS-Eval-v2``)
    * ``AEGIS_HF_DATASET_FILE`` - file name (default ``dataset.json``)
    * ``AEGIS_HF_DATASET_REVISION`` - optional revision
    * ``AEGIS_HF_OFFLINE`` - ``1``/``true``/``yes`` disables downloading

    A stable cached copy is used when it exists; otherwise the file is
    downloaded unless offline mode is on. The JSON may be either a list of
    records or an object with a ``"data"`` list. Non-dict entries are skipped.

    Each record is shallow-copied, ``_id`` is unwrapped to a string, and
    ``max_score`` / ``min_score`` / ``obtained_score`` are converted to
    ``float``. A score that is missing or cannot be parsed is left out of the
    record rather than stored as ``None``, so that
    ``record.get(key, default)`` in callers yields the default.

    Returns:
        The list of normalised records.

    Raises:
        RuntimeError: If no dataset file is available or the JSON has an
            unexpected top-level shape.
    """
    _load_dotenv_if_available()

    repo_id = os.environ.get("AEGIS_HF_DATASET_REPO") or "NishithP2004/AEGIS-Eval-v2"
    filename = os.environ.get("AEGIS_HF_DATASET_FILE") or "dataset.json"
    revision = os.environ.get("AEGIS_HF_DATASET_REVISION") or None
    offline = str(os.environ.get("AEGIS_HF_OFFLINE") or "").lower() in {"1", "true", "yes"}

    cache_dir = _cache_dir()
    stable_path = cache_dir / f"{repo_id.replace('/', '__')}__{filename}"

    path: Optional[Path] = None
    if stable_path.exists():
        path = stable_path
    elif not offline:
        path = _download_dataset_json(repo_id, filename, revision)

    if path is None or not path.exists():
        raise RuntimeError(
            f"Dataset cache not found. Expected {stable_path}. "
            "Set AEGIS_HF_OFFLINE=0 to allow download or provide the cached file."
        )

    data = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(data, dict) and isinstance(data.get("data"), list):
        records = data["data"]
    elif isinstance(data, list):
        records = data
    else:
        raise RuntimeError(f"Unexpected dataset.json shape in {path}")

    out: List[Dict[str, Any]] = []
    for rec in records:
        if not isinstance(rec, dict):
            continue
        norm: Dict[str, Any] = dict(rec)
        norm["_id"] = _unwrap_object_id(rec.get("_id"))
        for k in ("max_score", "min_score", "obtained_score"):
            number = _unwrap_number(rec.get(k))
            if number is None:
                # Drop the key instead of storing None: callers use
                # ``.get(k, default)`` and ``float(None)`` would raise.
                norm.pop(k, None)
            else:
                norm[k] = number
        out.append(norm)
    return out
|
| 189 |
+
|
| 190 |
+
|
| 191 |
class AegisEnvironment(Environment[AegisAction, AegisObservation, State]):
|
| 192 |
"""
|
| 193 |
Single-step grading episode: reset samples a row; step scores the agent output.
|
|
|
|
| 210 |
self.current_reference_feedback: str = ""
|
| 211 |
self.current_max_score: float = 1.0
|
| 212 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
try:
|
| 214 |
+
self.dataset = _load_dataset_records()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
except Exception as e:
|
| 216 |
+
self._load_error = f"Dataset load failed: {e!s}"
|
| 217 |
self.dataset = []
|
| 218 |
|
| 219 |
def reset(
|
|
|
|
| 259 |
|
| 260 |
# Store the ground truth for the deterministic reward calculation in step()
|
| 261 |
self.current_ground_truth = float(selected_record.get("obtained_score", 0.0))
|
| 262 |
+
self.current_reference_feedback = _reference_feedback_from_record(selected_record)
|
| 263 |
self.current_max_score = float(selected_record.get("max_score", 1.0) or 1.0)
|
| 264 |
|
| 265 |
# Ensure rubrics are handled even if missing (like in ASAP-SAS)
|