#!/usr/bin/env python3
"""
.sema CLI Builder v1.0
Converts any file into a self-describing .sema archive.
No API. No internet. Pure local intelligence.
Authors: Larbi + Claude | TREN Studio
License: MIT
"""

import os
import sys
import json
import uuid
import zipfile
import hashlib
import argparse
import re
from datetime import datetime, timezone
from pathlib import Path
from collections import Counter

# ─────────────────────────────────────────────
# CONTENT EXTRACTORS
# ─────────────────────────────────────────────


def extract_text_from_pdf(path):
    """Extract plain text from a PDF via PyMuPDF (imported lazily).

    Returns the stripped text, or a bracketed error string when the
    dependency is missing or the file cannot be parsed.
    """
    try:
        import fitz  # PyMuPDF — lazy import so the CLI runs without it
        doc = fitz.open(path)
        text = ""
        for page in doc:
            text += page.get_text()
        doc.close()
        return text.strip()
    except Exception as e:
        return f"[PDF extraction error: {e}]"


def extract_text_from_docx(path):
    """Extract non-empty paragraph text from a .docx file (python-docx).

    Returns joined paragraphs, or a bracketed error string on failure.
    """
    try:
        from docx import Document
        doc = Document(path)
        return "\n".join([p.text for p in doc.paragraphs if p.text.strip()])
    except Exception as e:
        return f"[DOCX extraction error: {e}]"


def extract_text_from_txt(path):
    """Read a text file with charset auto-detection (chardet).

    Falls back to utf-8 when detection yields nothing; undecodable
    bytes are replaced rather than raising.
    """
    try:
        import chardet
        with open(path, 'rb') as f:
            raw = f.read()
        enc = chardet.detect(raw).get('encoding', 'utf-8') or 'utf-8'
        return raw.decode(enc, errors='replace')
    except Exception as e:
        return f"[TXT extraction error: {e}]"


def extract_text_from_image(path):
    """Extract human-readable description from image — NOT raw JSON.

    Builds an English prose description (format, dimensions, EXIF
    camera/date, filename hint) and appends the raw metadata inside a
    [RAW_META]...[/RAW_META] JSON block for downstream structured use.
    """
    try:
        from PIL import Image
        img = Image.open(path)
        fname = Path(path).name
        stem = Path(path).stem.replace('_', ' ').replace('-', ' ')

        # EXIF data — best effort; absent or corrupt EXIF must not
        # abort the extraction.
        exif_data = {}
        camera = ""
        date_taken = ""
        try:
            exif = img._getexif()  # NOTE: private PIL API, returns None if no EXIF
            if exif:
                from PIL.ExifTags import TAGS
                for tag_id, value in exif.items():
                    tag = TAGS.get(tag_id, tag_id)
                    if isinstance(value, (str, int, float)):
                        exif_data[str(tag)] = str(value)
                make = exif_data.get("Make", "").strip()
                model = exif_data.get("Model", "").strip()
                if make or model:
                    camera = f"{make} {model}".strip()
                date_taken = exif_data.get("DateTimeOriginal", "") or exif_data.get("DateTime", "")
        except Exception:
            # was a bare `except:` — narrowed so Ctrl-C still propagates
            pass

        # Size info
        try:
            size_kb = round(Path(path).stat().st_size / 1024, 1)
        except OSError:
            # was a bare `except:` — only filesystem errors are expected here
            size_kb = 0

        orientation = "landscape" if img.width > img.height else ("portrait" if img.height > img.width else "square")
        mp = round((img.width * img.height) / 1_000_000, 1)

        # Build HUMAN-READABLE description (this is what the brain uses)
        lines = [
            f"This is a {img.format} image file named '{fname}'.",
            f"Dimensions: {img.width} x {img.height} pixels ({mp} megapixels), {orientation} orientation.",
            f"Color mode: {img.mode}. File size: {size_kb} KB.",
        ]
        if camera:
            lines.append(f"Captured with: {camera}.")
        if date_taken:
            lines.append(f"Date taken: {date_taken}.")
        if stem and len(stem) > 2:
            lines.append(f"The file name suggests it may be related to: {stem}.")

        # Store raw metadata at end (NOT used as summary)
        raw = json.dumps({"format": img.format, "mode": img.mode,
                          "width": img.width, "height": img.height,
                          "exif": exif_data})
        lines.append(f"[RAW_META]{raw}[/RAW_META]")
        return "\n".join(lines)
    except Exception as e:
        return f"[Image extraction error: {e}]"


def extract_text_from_xlsx(path):
    """Extract up to 200 rows per sheet from a workbook (openpyxl).

    Each sheet contributes a "Sheet: <title>" header followed by
    pipe-joined cell values; returns a bracketed error string on failure.
    """
    try:
        import openpyxl
        wb = openpyxl.load_workbook(path, read_only=True, data_only=True)
        texts = []
        for ws in wb.worksheets:
            texts.append(f"Sheet: {ws.title}")
            # Cap at 200 rows so huge workbooks stay fast and bounded
            for row in ws.iter_rows(max_row=200, values_only=True):
                row_text = " | ".join(str(c) for c in row if c is not None)
                if row_text.strip():
                    texts.append(row_text)
        return "\n".join(texts)
    except Exception as e:
        return f"[XLSX extraction error: {e}]"


def extract_text(file_path):
    """Dispatch to the right extractor based on file extension.

    Unknown extensions return a bracketed placeholder string rather
    than raising, so the builder pipeline always gets text back.
    """
    ext = Path(file_path).suffix.lower()
    if ext == '.pdf':
        return extract_text_from_pdf(file_path)
    elif ext in ['.docx']:
        return extract_text_from_docx(file_path)
    elif ext in ['.txt', '.md', '.csv', '.json', '.html', '.xml', '.py', '.js']:
        return extract_text_from_txt(file_path)
    elif ext in ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff']:
        return extract_text_from_image(file_path)
    elif ext in ['.xlsx', '.xls']:
        return extract_text_from_xlsx(file_path)
    else:
        return f"[No text extractor for {ext}]"
# ─────────────────────────────────────────────
# LOCAL BRAIN ENGINE (Zero API)
# ─────────────────────────────────────────────

# Arabic stop words
STOPWORDS_AR = set([
    "في","من","إلى","على","عن","مع","هذا","هذه","التي","الذي","كان","كانت",
    "أن","إن","وأن","ولا","وهو","وهي","هو","هي","أو","لا","ما","كل","قد",
    "لم","لن","ثم","حتى","بعد","قبل","عند","منذ","خلال","بين","بعض","كيف",
    "لقد","إذا","لكن","ولكن","حين","أيضاً","أيضا","فقط","وقد","وكان","وهذا"
])

# English stop words
STOPWORDS_EN = set([
    "the","a","an","and","or","but","in","on","at","to","for","of","with",
    "by","from","is","are","was","were","be","been","have","has","had",
    "do","does","did","will","would","could","should","may","might","it",
    "its","this","that","these","those","he","she","they","we","you","i",
    "not","no","nor","so","yet","both","either","just","than","then","when",
    "where","who","which","what","how","if","as","up","out","about","into"
])


def clean_and_tokenize(text):
    """Extract meaningful tokens from text.

    Strips everything except word characters and Arabic letters,
    lowercases, drops tokens of length <= 3 and English/Arabic stop
    words. Returns a list of tokens in original order.
    """
    # Keep word chars and the Arabic Unicode block; everything else → space
    text = re.sub(r'[^\w\s\u0600-\u06FF]', ' ', text)
    tokens = text.lower().split()
    tokens = [t for t in tokens if len(t) > 3]
    tokens = [t for t in tokens if t not in STOPWORDS_EN and t not in STOPWORDS_AR]
    return tokens


def extract_keywords(text, top_n=20):
    """Extract the top_n keywords by raw token frequency."""
    tokens = clean_and_tokenize(text)
    freq = Counter(tokens)
    return [word for word, _ in freq.most_common(top_n)]


def detect_language(text):
    """Simple language detection: returns "ar", "fr" or "en".

    Arabic wins when Arabic characters outnumber Latin ones; French is
    guessed from common function-word substrings; default is English.
    """
    arabic_chars = len(re.findall(r'[\u0600-\u06FF]', text))
    latin_chars = len(re.findall(r'[a-zA-Z]', text))
    french_indicators = ['le ', 'la ', 'les ', 'de ', 'du ', 'des ', 'et ', 'est ', 'une ', 'un ']
    french_count = sum(1 for f in french_indicators if f in text.lower())
    if arabic_chars > latin_chars:
        return "ar"
    elif french_count > 3:
        return "fr"
    else:
        return "en"


def generate_summary(text, sentences=3):
    """Generate human-readable summary — never echo raw JSON or repeat the file.

    Extractive summarizer: splits into sentences, scores each by
    normalized keyword density, and re-emits the top `sentences` in
    their original order. Very short inputs get a synthesized
    description instead of an echo.
    """
    # Strip raw meta tags from images before summarizing
    clean_text = re.sub(r'\[RAW_META\].*?\[/RAW_META\]', '', text, flags=re.DOTALL).strip()

    # Split into sentences (Latin and Arabic terminators)
    sent_pattern = r'(?<=[.!?؟])\s+|(?<=[.!?؟])$'
    sentences_list = re.split(sent_pattern, clean_text.strip())
    # Drop fragments, and lines that look like JSON/bracketed metadata
    sentences_list = [s.strip() for s in sentences_list
                      if len(s.strip()) > 20
                      and not s.strip().startswith('{')
                      and not s.strip().startswith('[')]

    if not sentences_list or len(clean_text.strip()) < 80:
        # Micro-file: generate a smart description, never echo
        words = clean_text.split()[:20]
        snippet = ' '.join(words)
        char_count = len(clean_text)
        return (f"This is a compact file ({char_count} characters) containing brief or specialized content. "
                f"It appears to be a short note, configuration snippet, or reference document. "
                f"Key excerpt: '{snippet}...'")

    # Score sentences by keyword density.
    # FIX: frequencies were previously built from the raw `text`
    # (including the [RAW_META] JSON payload), which skewed scores
    # toward metadata tokens; score against the cleaned text instead.
    all_tokens = clean_and_tokenize(clean_text)
    freq = Counter(all_tokens)

    def score_sentence(s):
        tokens = clean_and_tokenize(s)
        # +1 in the denominator avoids division by zero for token-free sentences
        return sum(freq.get(t, 0) for t in tokens) / (len(tokens) + 1)

    scored = sorted(sentences_list, key=score_sentence, reverse=True)
    top = scored[:sentences]

    # Re-order the selected sentences by original position
    result = []
    for s in sentences_list:
        if s in top:
            result.append(s)
            if len(result) == sentences:
                break
    return ' '.join(result) if result else sentences_list[0][:200] + '...'
def detect_content_type(file_path, text):
    """Guess content type from extension and content.

    Returns a "family/kind" string such as "image/photo" or
    "document/recipe"; falls back to "document/generic".
    """
    ext = Path(file_path).suffix.lower()
    text_lower = (text or '').lower()

    # By extension
    if ext in ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp']:
        return "image/photo"
    if ext in ['.xlsx', '.xls', '.csv']:
        return "data/spreadsheet"
    if ext in ['.mp3', '.wav', '.ogg', '.m4a']:
        return "media/audio"
    if ext in ['.mp4', '.avi', '.mov', '.mkv']:
        return "media/video"

    # By content keywords (English / Arabic / French markers)
    recipe_words = ['ingredients', 'recipe', 'tablespoon', 'cup', 'cook', 'bake',
                    'وصفة', 'مقادير', 'طريقة', 'دقيقة', 'غرام', 'recette', 'ingrédients']
    invoice_words = ['invoice', 'total', 'amount', 'payment', 'due', 'فاتورة', 'مبلغ']
    contract_words = ['agreement', 'contract', 'party', 'clause', 'عقد', 'اتفاقية']

    recipe_score = sum(1 for w in recipe_words if w in text_lower)
    invoice_score = sum(1 for w in invoice_words if w in text_lower)
    contract_score = sum(1 for w in contract_words if w in text_lower)

    if recipe_score >= 3:
        return "document/recipe"
    if invoice_score >= 2:
        return "document/invoice"
    if contract_score >= 2:
        return "document/contract"
    return "document/generic"


def extract_entities(text):
    """Simple named entity detection using patterns.

    Populates "dates" and "places" via regex; "people" and "concepts"
    are reserved keys kept for schema stability (always empty here).
    Each category is de-duplicated (order-preserving) and capped at 5.
    """
    entities = {"people": [], "places": [], "concepts": [], "dates": []}

    # Dates: ISO, slash-style, and long English month formats
    date_patterns = [
        r'\b\d{4}-\d{2}-\d{2}\b',
        r'\b\d{1,2}/\d{1,2}/\d{2,4}\b',
        r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b',
    ]
    for pattern in date_patterns:
        found = re.findall(pattern, text, re.IGNORECASE)
        entities["dates"].extend(found[:5])

    # Places: capitalized words after location prepositions or known toponyms
    place_patterns = [
        r'(?:in|at|from|to|near|المغرب|مراكش|الرباط|فاس|طنجة|Morocco|Marrakech|Rabat|Fes)\s+([A-Z][a-z]+)',
    ]
    for pattern in place_patterns:
        found = re.findall(pattern, text)
        entities["places"].extend(found[:5])

    # Remove duplicates while preserving first-seen order
    for k in entities:
        entities[k] = list(dict.fromkeys(entities[k]))[:5]
    return entities


def generate_content_data(content_type, text):
    """Generate content-type-specific structured data."""
    data = {}

    if content_type == "document/recipe":
        # Extract ingredients hints: lines mentioning a measurement unit
        lines = text.split('\n')
        ingredient_lines = []
        for line in lines:
            if any(unit in line.lower() for unit in ['cup', 'tbsp', 'tsp', 'gram', 'kg', 'g ',
                                                     'كوب', 'ملعقة', 'غرام', 'كيلو']):
                ingredient_lines.append(line.strip())
        data["ingredients_hints"] = ingredient_lines[:10]
        # Try to find cooking time
        time_match = re.search(r'(\d+)\s*(?:minutes?|mins?|دقيقة|دقائق)', text, re.IGNORECASE)
        if time_match:
            data["cook_time_minutes"] = int(time_match.group(1))
        word_count = len(text.split())
        data["estimated_steps"] = max(1, word_count // 80)

    elif content_type == "image/photo":
        # FIX: the image extractor emits human-readable prose with the
        # structured metadata embedded in a [RAW_META]{...}[/RAW_META]
        # block. The old code did json.loads(text) on the whole prose,
        # which always failed into a bare except. Parse the payload.
        meta_match = re.search(r'\[RAW_META\](.*?)\[/RAW_META\]', text, re.DOTALL)
        if meta_match:
            try:
                data = json.loads(meta_match.group(1))
            except ValueError:
                data["note"] = "Image file"
        else:
            data["note"] = "Image file"

    elif content_type == "data/spreadsheet":
        lines = text.split('\n')
        # Rows were serialized with " | " separators by the extractor
        data["estimated_rows"] = len([l for l in lines if '|' in l])
        data["sheets"] = [l.replace('Sheet:', '').strip() for l in lines if l.startswith('Sheet:')]

    elif content_type == "document/generic":
        words = text.split()
        data["word_count"] = len(words)
        # ~200 words/minute average reading speed
        data["reading_time_minutes"] = max(1, len(words) // 200)
        headings = re.findall(r'^#{1,3}\s+(.+)$', text, re.MULTILINE)
        data["headings"] = headings[:10]

    return data


def generate_questions(text, content_type, summary, keywords):
    """Pre-generate meaningful Q&A — never raw JSON answers."""
    # Strip raw meta before using text for Q&A
    clean_text = re.sub(r'\[RAW_META\].*?\[/RAW_META\]', '', text, flags=re.DOTALL).strip()

    questions = [
        {"q": "What is this file about?", "a": summary},
        {"q": "What is this?", "a": summary},
        {"q": "Summarize this file.", "a": summary},
        {"q": "What are the main topics?",
         "a": ", ".join(keywords[:8]) if keywords else "Not enough content to extract topics."},
    ]

    if content_type == "document/recipe":
        questions.extend([
            {"q": "What kind of file is this?",
             "a": "This is a recipe document containing cooking instructions and ingredients."},
            {"q": "ما هذا الملف؟", "a": "هذا ملف وصفة طبخ يحتوي على مكونات وطريقة التحضير."},
        ])
    elif content_type == "image/photo":
        # Parse embedded human-readable lines (NOT raw JSON)
        img_lines = [l for l in clean_text.split('\n') if l.strip() and not l.startswith('[')]
        img_desc = ' '.join(img_lines[:4]) if img_lines else summary
        # Extract dimensions from clean text
        dim_match = re.search(r'(\d+)\s*x\s*(\d+)\s*pixels', clean_text)
        dims = f"{dim_match.group(1)}×{dim_match.group(2)} pixels" if dim_match else "Unknown dimensions"
        orient_match = re.search(r'(landscape|portrait|square)', clean_text, re.IGNORECASE)
        orient = orient_match.group(1) if orient_match else ""
        fmt_match = re.search(r'This is a (\w+) image', clean_text)
        fmt = fmt_match.group(1) if fmt_match else "image"
        questions.extend([
            {"q": "What type of image is this?",
             "a": f"This is a {fmt} image, {dims}{(', ' + orient + ' orientation') if orient else ''}."},
            {"q": "What are the image dimensions?", "a": dims},
            {"q": "Describe this image.", "a": img_desc},
            {"q": "What is the file format?", "a": f"{fmt} format — a standard digital image format."},
        ])
    elif content_type == "document/invoice":
        questions.append({
            "q": "What type of document is this?",
            "a": "This is a financial invoice or billing document."
        })
    elif content_type == "data/spreadsheet":
        questions.append({
            "q": "What type of file is this?",
            "a": "This is a spreadsheet or data table, likely containing rows and columns of structured data."
        })

    # Language-specific questions
    lang = detect_language(clean_text)
    if lang == "ar":
        questions.append({"q": "ما هي الكلمات الرئيسية؟", "a": "، ".join(keywords[:8])})
        questions.append({"q": "ما هذا الملف؟", "a": summary})
    elif lang == "fr":
        questions.append({"q": "De quoi parle ce fichier?", "a": summary[:200]})

    return questions


# ─────────────────────────────────────────────
# VIEW.HTML GENERATOR
# ─────────────────────────────────────────────

def generate_view_html(manifest, brain, original_filename):
    """Generate a self-contained viewer HTML (no external assets).

    Embeds the manifest metadata, summary, keyword tags, and the
    pre-generated Q&A as JSON driving a small "Ask This File" widget.
    """
    title = manifest.get("title", original_filename)
    author = manifest.get("author", {}).get("name", "Unknown")
    org = manifest.get("author", {}).get("org", "")
    content_type = manifest.get("content_type", "document/generic")
    created = manifest.get("created_at", "")[:10]
    summary = brain.get("summary", "")
    keywords = brain.get("keywords", [])[:12]
    questions = brain.get("questions", [])
    lang = manifest.get("lang", "en")
    sema_id = manifest.get("id", "")

    # Serialize Q&A for JS
    qa_json = json.dumps(questions, ensure_ascii=False)
    keywords_json = json.dumps(keywords, ensure_ascii=False)
    keyword_tags = ' '.join(f'<span class="tag">{k}</span>' for k in keywords)

    html = f'''<!DOCTYPE html>
<html lang="{lang}">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>{title} — .sema</title>
<style>
body {{ font-family: system-ui, sans-serif; max-width: 760px; margin: 2rem auto; padding: 0 1rem; color: #222; }}
header {{ border-bottom: 1px solid #ddd; padding-bottom: 1rem; }}
.meta-line {{ color: #666; font-size: 0.9rem; }}
.badge {{ background: #eee; border-radius: 4px; padding: 2px 6px; font-size: 0.8rem; }}
.tag {{ background: #eef; border-radius: 4px; padding: 2px 6px; margin-right: 4px; font-size: 0.8rem; }}
table {{ border-collapse: collapse; }}
td {{ padding: 4px 10px; border-bottom: 1px solid #eee; }}
</style>
</head>
<body>
<header>
<h1>{title}</h1>
<p class="meta-line"><span class="badge">{content_type}</span> &nbsp; By {author}{(' · ' + org) if org else ''} &nbsp;·&nbsp; {created} &nbsp; <span class="badge">.sema v1.0</span></p>
</header>

<h2>📋 Summary</h2>
<p>{summary}</p>
<p>{keyword_tags}</p>

<h2>💬 Ask This File</h2>
<input id="q" type="text" placeholder="Ask a question...">
<button onclick="ask()">Ask</button>
<p id="answer"></p>

<h2>📌 Metadata</h2>
<table>
<tr><td>File</td><td>{original_filename}</td></tr>
<tr><td>Type</td><td>{content_type}</td></tr>
<tr><td>Created</td><td>{created}</td></tr>
<tr><td>Author</td><td>{author}</td></tr>
<tr><td>Language</td><td>{lang}</td></tr>
<tr><td>Sema ID</td><td>{sema_id[:20]}...</td></tr>
</table>

<h2>📄 Content Preview</h2>
<p id="preview">Loading preview...</p>

<script>
const QA = {qa_json};
const KEYWORDS = {keywords_json};
function ask() {{
  const raw = document.getElementById('q').value.toLowerCase();
  const words = raw.split(' ');
  let best = null;
  let bestScore = 0;
  for (const item of QA) {{
    let score = 0;
    const q = item.q.toLowerCase();
    for (const w of words) {{
      if (w && q.indexOf(w) !== -1) score += 1;
    }}
    if (score > bestScore) {{ bestScore = score; best = item; }}
  }}
  document.getElementById('answer').textContent =
    best ? best.a : 'No matching answer found in this file.';
}}
document.getElementById('preview').textContent =
  'Open content/{original_filename} inside this archive to view the original file.';
</script>
</body>
</html>'''
    return html


# ─────────────────────────────────────────────
# MAIN BUILDER
# ─────────────────────────────────────────────

def build_sema(input_path, output_path=None, author_name="Unknown", author_org="",
               title=None, verbose=True):
    """Main function: converts a file to .sema format.

    Pipeline: checksum → text extraction → semantic analysis →
    manifest (sema.json) → brain (brain.json) → viewer (view.html) →
    ZIP packaging. Returns the output path string on success,
    False when the input file does not exist.
    """
    input_path = Path(input_path)
    if not input_path.exists():
        print(f"[ERROR] File not found: {input_path}")
        return False
    if not output_path:
        output_path = input_path.with_suffix('.sema')
    output_path = Path(output_path)

    if verbose:
        print(f"\n{'─'*50}")
        print(f" .sema Builder v1.0 — TREN Studio")
        print(f"{'─'*50}")
        print(f" Input: {input_path.name}")
        print(f" Output: {output_path.name}")
        print(f"{'─'*50}\n")

    # 1. Compute checksum (streamed in 64 KiB chunks to bound memory)
    if verbose:
        print(" [1/6] Computing checksum...")
    sha256 = hashlib.sha256()
    with open(input_path, 'rb') as f:
        for chunk in iter(lambda: f.read(65536), b''):
            sha256.update(chunk)
    checksum = sha256.hexdigest()

    # 2. Extract text
    if verbose:
        print(" [2/6] Extracting content...")
    text = extract_text(input_path)

    # 3. Analyze content
    if verbose:
        print(" [3/6] Analyzing semantics...")
    # Use clean text (without RAW_META tags) for NLP tasks
    clean_text = re.sub(r'\[RAW_META\].*?\[/RAW_META\]', '', text, flags=re.DOTALL).strip()
    lang = detect_language(clean_text)
    content_type = detect_content_type(input_path, clean_text)
    keywords = extract_keywords(clean_text, top_n=20)
    summary = generate_summary(text, sentences=3)  # pass full text so summary fn can strip
    entities = extract_entities(clean_text)
    content_data = generate_content_data(content_type, text if content_type == "image/photo" else clean_text)
    questions = generate_questions(text, content_type, summary, keywords)

    # 4. Build sema.json
    if verbose:
        print(" [4/6] Building manifest...")
    file_title = title or input_path.stem.replace('-', ' ').replace('_', ' ').title()
    sema_id = f"sema_{uuid.uuid4()}"
    created_at = datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')

    # Detect MIME type
    mime_map = {
        '.pdf': 'application/pdf',
        '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        '.jpg': 'image/jpeg', '.jpeg': 'image/jpeg', '.png': 'image/png',
        '.txt': 'text/plain', '.md': 'text/markdown', '.html': 'text/html',
        '.mp3': 'audio/mpeg', '.mp4': 'video/mp4',
    }
    mime_type = mime_map.get(input_path.suffix.lower(), 'application/octet-stream')

    manifest = {
        "sema_version": "1.0.0",
        "id": sema_id,
        "created_at": created_at,
        "content_type": content_type,
        "mime_type": mime_type,
        "filename": input_path.name,
        "lang": lang,
        "title": file_title,
        "description": summary[:200] if summary else "",
        "tags": keywords[:8],
        "author": {
            "name": author_name,
            "org": author_org,
            "contact": ""
        },
        "checksum": {
            "algo": "sha256",
            "value": checksum
        },
        "content_size_bytes": input_path.stat().st_size,
        "expires_at": None,
        "geo": {},
        "relations": [],
        "custom": {}
    }

    # 5. Build brain.json
    brain = {
        "brain_version": "1.0",
        "generated_at": created_at,
        "generator": "sema-builder-cli/1.0-local",
        "summary": summary,
        "keywords": keywords,
        "entities": entities,
        "topics": keywords[:6],
        "content_data": content_data,
        "search_text": text[:10000],
        "questions": questions,
        "alt_text": summary[:150] if summary else file_title,
        "translations": {}
    }

    # 6. Generate view.html
    if verbose:
        print(" [5/6] Generating viewer...")
    view_html = generate_view_html(manifest, brain, input_path.name)

    # 7. Package everything into ZIP (.sema)
    if verbose:
        print(" [6/6] Packaging .sema archive...")
    with zipfile.ZipFile(output_path, 'w', compression=zipfile.ZIP_DEFLATED, compresslevel=6) as zf:
        zf.writestr('sema.json', json.dumps(manifest, ensure_ascii=False, indent=2))
        zf.writestr('brain.json', json.dumps(brain, ensure_ascii=False, indent=2))
        zf.writestr('view.html', view_html)
        zf.write(input_path, f'content/{input_path.name}')

    if verbose:
        size_kb = output_path.stat().st_size / 1024
        print(f"\n{'─'*50}")
        print(f" ✓ Done! Output: {output_path.name} ({size_kb:.1f} KB)")
        print(f" ✓ Content type: {content_type}")
        print(f" ✓ Language: {lang}")
        print(f" ✓ Keywords: {', '.join(keywords[:5])}...")
        print(f" ✓ Q&A pairs: {len(questions)}")
        print(f"{'─'*50}\n")
        print(f" Open view.html from the .sema archive in any browser.")
        print(f" (Rename .sema to .zip and extract to see all layers)\n")

    return str(output_path)


# ─────────────────────────────────────────────
# CLI ENTRY POINT
# ─────────────────────────────────────────────

def main():
    """Parse CLI arguments and run the builder; exits 1 on failure."""
    parser = argparse.ArgumentParser(
        description='.sema Builder — Convert any file to a self-describing semantic archive',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # FIX: the second example previously used `-o "TREN Studio"`, but
        # -o is --output; the organization flag is --org.
        epilog="""
Examples:
  python sema_builder.py recipe.pdf
  python sema_builder.py photo.jpg -a "Larbi" --org "TREN Studio"
  python sema_builder.py document.docx --title "My Document" --output custom.sema
"""
    )
    parser.add_argument('input', help='Input file to convert')
    parser.add_argument('-o', '--output', help='Output .sema file path (optional)')
    parser.add_argument('-a', '--author', default='Unknown', help='Author name')
    parser.add_argument('--org', default='', help='Organization name')
    parser.add_argument('--title', default=None, help='File title override')
    parser.add_argument('--quiet', action='store_true', help='Suppress output')

    args = parser.parse_args()
    result = build_sema(
        input_path=args.input,
        output_path=args.output,
        author_name=args.author,
        author_org=args.org,
        title=args.title,
        verbose=not args.quiet
    )
    sys.exit(0 if result else 1)


if __name__ == '__main__':
    main()