Upload rosetta_generator.py with huggingface_hub
Browse files- rosetta_generator.py +211 -0
rosetta_generator.py
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import csv
|
| 2 |
+
import random
|
| 3 |
+
|
| 4 |
+
# ============================
|
| 5 |
+
# CONFIGURATION
|
| 6 |
+
# ============================
|
| 7 |
+
# Path of the CSV file that generate_dataset() writes (overwritten on each run).
OUTPUT_FILE = "rosetta_code_dataset.csv"
# Number of rows sampled per algorithm entry. Each row is a random draw with
# replacement; an entry with 5 prompts, 3 languages and 5 phrasings yields at
# most 75 distinct queries, so 500 samples necessarily contain repeats.
SAMPLES_PER_ALGO = 500
|
| 9 |
+
|
| 10 |
+
# ============================
|
| 11 |
+
# 1. THE ULTIMATE ALGORITHM LIBRARY
|
| 12 |
+
# ============================
|
| 13 |
+
# Template library, keyed by algorithm name. Every entry has the same shape:
#   "prompts": list of short natural-language descriptions of the task
#   "python" / "cpp" / "java": one reference snippet per language, stored as a
#       single string with "\n" line breaks
# The snippets are emitted verbatim into the dataset; they are never executed
# by this script.
# NOTE(review): inside the snippet strings every nested line is indented by a
# single space only, so multi-level Python snippets (e.g. prime_check,
# bubble_sort) are not valid Python as written. Presumably the indentation was
# collapsed somewhere upstream -- confirm against the original file.
# NOTE(review): several cpp/java snippets are bare fragments (no includes,
# class or main) and some hard-code their input while the Python version reads
# it -- confirm this asymmetry is intended for the training data.
ALGORITHMS = {
    # ---------------------------
    # BASIC MATH & LOGIC
    # ---------------------------
    "factorial": {
        "prompts": ["factorial of a number", "calculate n!", "multiplication of 1 to n", "find factorial", "fact code"],
        "python": "def factorial(n):\n return 1 if n == 0 else n * factorial(n-1)\nnum = int(input())\nprint(factorial(num))",
        "cpp": "#include<iostream>\nusing namespace std;\nint factorial(int n) {\n return (n == 0) ? 1 : n * factorial(n - 1);\n}\nint main() {\n int n; cin>>n;\n cout << factorial(n);\n}",
        "java": "import java.util.Scanner;\nclass Main {\n static int factorial(int n) {\n return (n == 0) ? 1 : n * factorial(n - 1);\n }\n public static void main(String[] args) {\n Scanner sc = new Scanner(System.in);\n System.out.println(factorial(sc.nextInt()));\n }\n}"
    },
    "fibonacci": {
        "prompts": ["fibonacci series", "print fib numbers", "sequence 0 1 1 2 3", "fib series", "fibonacci recursion"],
        "python": "n = int(input())\na, b = 0, 1\nfor _ in range(n):\n print(a, end=' ')\n a, b = b, a+b",
        "cpp": "int n, a=0, b=1, next;\ncin >> n;\nfor (int i = 0; i < n; i++) {\n cout << a << \" \";\n next = a + b;\n a = b;\n b = next;\n}",
        "java": "int n = 10, a = 0, b = 1;\nfor (int i = 0; i < n; i++) {\n System.out.print(a + \" \");\n int next = a + b;\n a = b;\n b = next;\n}"
    },
    "swap_two_numbers": {
        "prompts": ["swap two numbers", "swap variables without temp", "exchange values", "swap logic"],
        "python": "a = int(input())\nb = int(input())\na, b = b, a\nprint(a, b)",
        "cpp": "int a, b;\ncin >> a >> b;\na = a + b;\nb = a - b;\na = a - b;\ncout << a << \" \" << b;",
        "java": "int a = 5, b = 10;\na = a + b;\nb = a - b;\na = a - b;\nSystem.out.println(a + \" \" + b);"
    },
    "leap_year": {
        "prompts": ["check leap year", "is year leap", "leap year logic", "days in february year"],
        "python": "year = int(input())\nif (year % 4 == 0 and year % 100 != 0) or (year % 400 == 0):\n print('Leap Year')\nelse:\n print('Not Leap Year')",
        "cpp": "int year;\ncin >> year;\nif ((year % 4 == 0 && year % 100 != 0) || (year % 400 == 0))\n cout << \"Leap Year\";\nelse\n cout << \"Not Leap Year\";",
        "java": "int year = 2024;\nif ((year % 4 == 0 && year % 100 != 0) || (year % 400 == 0))\n System.out.println(\"Leap Year\");\nelse\n System.out.println(\"Not Leap Year\");"
    },
    "odd_even": {
        "prompts": ["check odd even", "is number divisible by 2", "find even number", "odd number logic"],
        "python": "num = int(input())\nif num % 2 == 0: print('Even')\nelse: print('Odd')",
        "cpp": "int n; cin >> n;\nif(n % 2 == 0) cout << \"Even\";\nelse cout << \"Odd\";",
        "java": "int n = 5;\nif(n % 2 == 0) System.out.println(\"Even\");\nelse System.out.println(\"Odd\");"
    },
    "lcm_hcf": {
        "prompts": ["lcm and hcf", "gcd of two numbers", "least common multiple", "highest common factor"],
        "python": "import math\na, b = 12, 15\ngcd = math.gcd(a, b)\nlcm = (a*b)//gcd\nprint('HCF:', gcd, 'LCM:', lcm)",
        "cpp": "int gcd(int a, int b) { return b==0?a:gcd(b, a%b); }\nint main() {\n int a=12, b=15;\n cout<<\"HCF: \"<<gcd(a,b);\n cout<<\"LCM: \"<<(a*b)/gcd(a,b);\n}",
        "java": "static int gcd(int a, int b) { return b==0?a:gcd(b, a%b); }\npublic static void main(String[] args) {\n int a=12, b=15;\n System.out.println(\"HCF: \"+gcd(a,b));\n System.out.println(\"LCM: \"+(a*b)/gcd(a,b));\n}"
    },

    # ---------------------------
    # NUMBER THEORY
    # ---------------------------
    "prime_check": {
        "prompts": ["check prime number", "is prime or not", "prime no program", "find if number is prime"],
        "python": "num = int(input())\nif num > 1:\n for i in range(2, int(num**0.5)+1):\n if (num % i) == 0: print('Not Prime'); break\n else: print('Prime')\nelse: print('Not Prime')",
        "cpp": "bool isPrime(int n) {\n if (n <= 1) return false;\n for (int i = 2; i * i <= n; i++)\n if (n % i == 0) return false;\n return true;\n}",
        "java": "boolean isPrime(int n) {\n if (n <= 1) return false;\n for (int i = 2; i * i <= n; i++)\n if (n % i == 0) return false;\n return true;\n}"
    },
    "armstrong": {
        "prompts": ["armstrong number", "sum of cubes of digits", "check armstrong", "narcissistic number"],
        "python": "n = int(input())\nsum = 0\ntemp = n\nwhile temp > 0:\n digit = temp % 10\n sum += digit ** 3\n temp //= 10\nif n == sum: print('Armstrong')\nelse: print('Not Armstrong')",
        "cpp": "int n, r, sum=0, temp;\ncin >> n;\ntemp = n;\nwhile(n>0){r=n%10;sum=sum+(r*r*r);n=n/10;}\nif(temp==sum) cout<<\"Armstrong\";\nelse cout<<\"Not\";",
        "java": "int n=153, r, sum=0, temp;\ntemp = n;\nwhile(n>0){r=n%10;sum=sum+(r*r*r);n=n/10;}\nif(temp==sum) System.out.println(\"Armstrong\");\nelse System.out.println(\"Not\");"
    },
    "palindrome_number": {
        "prompts": ["palindrome number", "reverse number equal", "check number palindrome"],
        "python": "n = input()\nif n == n[::-1]: print('Palindrome')\nelse: print('Not Palindrome')",
        "cpp": "int n, r, sum=0, temp;\ncin >> n;\ntemp = n;\nwhile(n>0){r=n%10;sum=(sum*10)+r;n=n/10;}\nif(temp==sum) cout<<\"Palindrome\";\nelse cout<<\"Not\";",
        "java": "int n=121, r, sum=0, temp;\ntemp = n;\nwhile(n>0){r=n%10;sum=(sum*10)+r;n=n/10;}\nif(temp==sum) System.out.println(\"Palindrome\");\nelse System.out.println(\"Not\");"
    },
    "sum_of_digits": {
        "prompts": ["sum of digits", "add all digits of number", "digit sum logic"],
        "python": "n = int(input())\ns = 0\nwhile n > 0:\n s += n % 10\n n //= 10\nprint(s)",
        "cpp": "int n, sum=0;\ncin >> n;\nwhile(n>0) { sum += n%10; n/=10; }\ncout << sum;",
        "java": "int n=123, sum=0;\nwhile(n>0) { sum += n%10; n/=10; }\nSystem.out.println(sum);"
    },
    "decimal_to_binary": {
        "prompts": ["decimal to binary", "convert dec to bin", "binary of number"],
        "python": "n = int(input())\nprint(bin(n).replace('0b', ''))",
        "cpp": "void decToBinary(int n) {\n int binaryNum[32];\n int i = 0;\n while (n > 0) {\n binaryNum[i] = n % 2;\n n = n / 2;\n i++;\n }\n for (int j = i - 1; j >= 0; j--) cout << binaryNum[j];\n}",
        "java": "void decToBinary(int n) {\n System.out.println(Integer.toBinaryString(n));\n}"
    },

    # ---------------------------
    # ARRAYS & MATRICES
    # ---------------------------
    "bubble_sort": {
        "prompts": ["bubble sort", "sort array ascending", "sorting algorithm", "arrange elements"],
        "python": "arr = [64, 34, 25, 12, 22, 11, 90]\nn = len(arr)\nfor i in range(n):\n for j in range(0, n-i-1):\n if arr[j] > arr[j+1]: arr[j], arr[j+1] = arr[j+1], arr[j]\nprint(arr)",
        "cpp": "void bubbleSort(int arr[], int n) {\n for (int i = 0; i < n-1; i++)\n for (int j = 0; j < n-i-1; j++)\n if (arr[j] > arr[j+1]) swap(arr[j], arr[j+1]);\n}",
        "java": "void bubbleSort(int arr[]) {\n int n = arr.length;\n for (int i = 0; i < n-1; i++)\n for (int j = 0; j < n-i-1; j++)\n if (arr[j] > arr[j+1]) {\n int temp = arr[j]; arr[j] = arr[j+1]; arr[j+1] = temp;\n }\n}"
    },
    "linear_search": {
        "prompts": ["linear search", "find element in array", "search number list"],
        "python": "arr = [10, 20, 30, 40]\nx = 30\nif x in arr: print('Found')\nelse: print('Not Found')",
        "cpp": "int search(int arr[], int n, int x) {\n for (int i = 0; i < n; i++)\n if (arr[i] == x) return i;\n return -1;\n}",
        "java": "int search(int arr[], int x) {\n for (int i = 0; i < arr.length; i++)\n if (arr[i] == x) return i;\n return -1;\n}"
    },
    "largest_in_array": {
        "prompts": ["largest element in array", "max in array", "find biggest number in list"],
        "python": "arr = [10, 324, 45, 90, 9808]\nprint(max(arr))",
        "cpp": "int largest(int arr[], int n) {\n int max = arr[0];\n for (int i = 1; i < n; i++)\n if (arr[i] > max) max = arr[i];\n return max;\n}",
        "java": "int largest(int arr[]) {\n int max = arr[0];\n for (int i = 1; i < arr.length; i++)\n if (arr[i] > max) max = arr[i];\n return max;\n}"
    },
    "matrix_add": {
        "prompts": ["matrix addition", "add two matrices", "sum of matrix"],
        "python": "X = [[1,2,3], [4 ,5,6], [7 ,8,9]]\nY = [[9,8,7], [6,5,4], [3,2,1]]\nresult = [[X[i][j] + Y[i][j] for j in range(len(X[0]))] for i in range(len(X))]\nfor r in result: print(r)",
        "cpp": "void addMatrix(int A[3][3], int B[3][3]) {\n for(int i=0;i<3;i++) {\n for(int j=0;j<3;j++) cout<<A[i][j]+B[i][j]<<\" \";\n cout<<endl;\n }\n}",
        "java": "void addMatrix(int A[][], int B[][]) {\n for(int i=0;i<3;i++) {\n for(int j=0;j<3;j++) System.out.print(A[i][j]+B[i][j]+\" \");\n System.out.println();\n }\n}"
    },
    "matrix_transpose": {
        "prompts": ["matrix transpose", "transpose of matrix", "swap rows and columns"],
        "python": "X = [[12,7], [4 ,5], [3 ,8]]\nresult = [[X[j][i] for j in range(len(X))] for i in range(len(X[0]))]\nfor r in result: print(r)",
        "cpp": "void transpose(int A[3][3]) {\n for(int i=0;i<3;i++) {\n for(int j=0;j<3;j++) cout<<A[j][i]<<\" \";\n cout<<endl;\n }\n}",
        "java": "void transpose(int A[][]) {\n for(int i=0;i<3;i++) {\n for(int j=0;j<3;j++) System.out.print(A[j][i]+\" \");\n System.out.println();\n }\n}"
    },

    # ---------------------------
    # STRINGS
    # ---------------------------
    "string_palindrome": {
        "prompts": ["string palindrome", "check word palindrome", "reverse string equal"],
        "python": "s = input()\nif s == s[::-1]: print('Palindrome')\nelse: print('Not Palindrome')",
        "cpp": "string s; cin >> s;\nstring rev = string(s.rbegin(), s.rend());\nif (s == rev) cout << \"Palindrome\";\nelse cout << \"Not\";",
        "java": "String str = \"madam\", rev = \"\";\nfor (int i = str.length() - 1; i >= 0; i--) rev = rev + str.charAt(i);\nif (str.equals(rev)) System.out.println(\"Palindrome\");"
    },
    "vowel_count": {
        "prompts": ["count vowels", "number of vowels in string", "vowel consonant count"],
        "python": "s = input().lower()\ncount = 0\nfor char in s:\n if char in 'aeiou': count += 1\nprint(count)",
        "cpp": "string s; cin >> s;\nint count = 0;\nfor(char c : s) {\n if(c=='a'||c=='e'||c=='i'||c=='o'||c=='u') count++;\n}\ncout << count;",
        "java": "String s = \"hello\";\nint count = 0;\nfor(int i=0; i<s.length(); i++) {\n char c = s.charAt(i);\n if(c=='a'||c=='e'||c=='i'||c=='o'||c=='u') count++;\n}"
    },

    # ---------------------------
    # PATTERNS (EXAM FAVORITES)
    # ---------------------------
    "star_pyramid": {
        "prompts": ["star pyramid", "triangle star pattern", "print pyramid"],
        "python": "n = 5\nfor i in range(n):\n print(' '*(n-i-1) + '* '*(i+1))",
        "cpp": "int n=5;\nfor(int i=1; i<=n; i++) {\n for(int j=1; j<=n-i; j++) cout<<\" \";\n for(int j=1; j<=i; j++) cout<<\"* \";\n cout<<endl;\n}",
        "java": "int n=5;\nfor(int i=1; i<=n; i++) {\n for(int j=1; j<=n-i; j++) System.out.print(\" \");\n for(int j=1; j<=i; j++) System.out.print(\"* \");\n System.out.println();\n}"
    },
    "right_triangle": {
        "prompts": ["right angle triangle", "star pattern right", "simple star pattern"],
        "python": "n=5\nfor i in range(1, n+1):\n print('* ' * i)",
        "cpp": "for(int i=1; i<=5; i++){\n for(int j=1; j<=i; j++) cout<<\"* \";\n cout<<endl;\n}",
        "java": "for(int i=1; i<=5; i++){\n for(int j=1; j<=i; j++) System.out.print(\"* \");\n System.out.println();\n}"
    },

    # ---------------------------
    # UTILITY
    # ---------------------------
    "calculator": {
        "prompts": ["simple calculator", "add sub mul div", "switch case calculator", "calc program"],
        "python": "def calc(a, b, op):\n if op == '+': return a + b\n elif op == '-': return a - b\n elif op == '*': return a * b\n elif op == '/': return a / b",
        "cpp": "switch(op) {\n case '+': cout << a+b; break;\n case '-': cout << a-b; break;\n case '*': cout << a*b; break;\n case '/': cout << a/b; break;\n}",
        "java": "switch(op) {\n case '+': System.out.println(a+b); break;\n case '-': System.out.println(a-b); break;\n case '*': System.out.println(a*b); break;\n case '/': System.out.println(a/b); break;\n}"
    },
    "hello_world": {
        "prompts": ["hello world", "print hello", "basic program", "test code"],
        "python": "print(\"Hello World\")",
        "cpp": "#include <iostream>\nusing namespace std;\nint main() {\n cout << \"Hello World\";\n return 0;\n}",
        "java": "public class Main {\n public static void main(String[] args) {\n System.out.println(\"Hello World\");\n }\n}"
    }
}
|
| 170 |
+
|
| 171 |
+
# ============================
|
| 172 |
+
# 2. GENERATOR LOGIC
|
| 173 |
+
# ============================
|
| 174 |
+
def generate_dataset(output_file=None, samples_per_algo=None, seed=None, algorithms=None):
    """Sample (prompt, language, code) rows from the algorithm library and save them as CSV.

    For every algorithm entry, ``samples_per_algo`` rows are drawn with
    replacement. Each row pairs a randomly phrased query (random base prompt,
    random language, random sentence template) with the snippet stored for
    that language. An entry with ``p`` prompts can produce at most
    ``p * 3 * 5`` distinct queries, so larger sample counts repeat rows.

    Args:
        output_file: Destination CSV path; defaults to the module-level
            ``OUTPUT_FILE``. The file is overwritten.
        samples_per_algo: Rows drawn per algorithm; defaults to the
            module-level ``SAMPLES_PER_ALGO``.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the output is reproducible; when ``None`` the shared
            ``random`` module state is used, exactly as before.
        algorithms: Optional mapping shaped like ``ALGORITHMS`` (each value
            has a ``"prompts"`` list plus ``"python"``, ``"cpp"`` and
            ``"java"`` snippets); defaults to the module-level ``ALGORITHMS``.

    Returns:
        The list of ``[query, language, code]`` rows that were written
        (header row excluded).

    Raises:
        KeyError: If an entry lacks ``"prompts"`` or the snippet for the
            language that was drawn.
        IndexError: If an entry's ``"prompts"`` list is empty and at least one
            sample is requested.
    """
    # Module globals are resolved at call time (not as signature defaults) so
    # the function stays importable and testable on its own.
    if output_file is None:
        output_file = OUTPUT_FILE
    if samples_per_algo is None:
        samples_per_algo = SAMPLES_PER_ALGO
    if algorithms is None:
        algorithms = ALGORITHMS

    # A dedicated generator keeps seeded runs from disturbing global state.
    rng = random if seed is None else random.Random(seed)

    # Loop invariants, hoisted out of the sampling loop.
    languages = ("python", "cpp", "java")
    query_templates = (
        "{prompt} in {lang}",
        "write {lang} code for {prompt}",
        "how to {prompt} using {lang}",
        "program for {prompt} in {lang}",
        "give me {prompt} code {lang}",
    )

    print("Generating THE ULTIMATE Rosetta Stone Dataset...")
    data = []

    for templates in algorithms.values():
        base_prompts = templates["prompts"]

        for _ in range(samples_per_algo):
            # Draw order (prompt, language, phrasing) matches the original
            # implementation so a pre-seeded ``random`` yields the same rows.
            prompt_base = rng.choice(base_prompts)
            lang = rng.choice(languages)
            query = rng.choice(query_templates).format(prompt=prompt_base, lang=lang)

            data.append([query, lang, templates[lang]])

    # newline='' lets the csv module control line endings, which matters
    # because the code column contains embedded newlines.
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["prompt", "language", "code"])
        writer.writerows(data)

    print(f"✅ Created {len(data)} training samples covering {len(algorithms)} major topics.")
    print(f"Saved to {output_file}")
    return data
|
| 209 |
+
|
| 210 |
+
# Script entry point: build and write the dataset only when this file is run
# directly, not when it is imported.
if __name__ == "__main__":
    generate_dataset()
|