#!/usr/bin/env python3
"""
.sema CLI Builder v1.0
Converts any file into a self-describing .sema archive.
No API. No internet. Pure local intelligence.
Authors: Larbi + Claude | TREN Studio
License: MIT
"""
import os
import sys
import json
import uuid
import zipfile
import hashlib
import argparse
import re
from datetime import datetime, timezone
from pathlib import Path
from collections import Counter

# ─────────────────────────────────────────────
# CONTENT EXTRACTORS
# ─────────────────────────────────────────────
# Each extractor is best-effort: third-party libraries are imported lazily
# inside the function, and ANY failure (missing library, unreadable file)
# is reported as a bracketed error string rather than raising, so the
# pipeline can keep going.


def extract_text_from_pdf(path):
    """Return the concatenated text of all PDF pages, or an error marker."""
    try:
        import fitz  # PyMuPDF
        doc = fitz.open(path)
        try:
            # join() instead of += in a loop; close() guaranteed even if a
            # page fails to render.
            return "".join(page.get_text() for page in doc).strip()
        finally:
            doc.close()
    except Exception as e:
        return f"[PDF extraction error: {e}]"


def extract_text_from_docx(path):
    """Return the non-empty paragraphs of a .docx file, newline-joined."""
    try:
        from docx import Document
        doc = Document(path)
        return "\n".join([p.text for p in doc.paragraphs if p.text.strip()])
    except Exception as e:
        return f"[DOCX extraction error: {e}]"


def extract_text_from_txt(path):
    """Read a text file with charset auto-detection (chardet), best-effort."""
    try:
        import chardet
        with open(path, 'rb') as f:
            raw = f.read()
        # chardet may return {'encoding': None}; fall back to utf-8 either way.
        enc = chardet.detect(raw).get('encoding', 'utf-8') or 'utf-8'
        return raw.decode(enc, errors='replace')
    except Exception as e:
        return f"[TXT extraction error: {e}]"


def extract_text_from_image(path):
    """Extract human-readable description from image — NOT raw JSON.

    Produces prose lines the summarizer can use, with the raw metadata
    appended inside a [RAW_META]...[/RAW_META] envelope for downstream
    structured consumers.
    """
    try:
        from PIL import Image
        # Context manager closes the underlying file handle (the original
        # leaked it).
        with Image.open(path) as img:
            fname = Path(path).name
            stem = Path(path).stem.replace('_', ' ').replace('-', ' ')

            # EXIF data (best-effort; many formats have none)
            exif_data = {}
            camera = ""
            date_taken = ""
            try:
                # NOTE(review): _getexif() is a private Pillow API; newer
                # Pillow exposes public getexif() — TODO confirm and migrate.
                exif = img._getexif()
                if exif:
                    from PIL.ExifTags import TAGS
                    for tag_id, value in exif.items():
                        tag = TAGS.get(tag_id, tag_id)
                        # Keep only scalar values; EXIF can contain bytes/tuples.
                        if isinstance(value, (str, int, float)):
                            exif_data[str(tag)] = str(value)
                    make = exif_data.get("Make", "").strip()
                    model = exif_data.get("Model", "").strip()
                    if make or model:
                        camera = f"{make} {model}".strip()
                    date_taken = exif_data.get("DateTimeOriginal", "") or exif_data.get("DateTime", "")
            except Exception:
                # Narrowed from bare `except:` — never swallow SystemExit et al.
                pass

            # Size info
            try:
                size_kb = round(Path(path).stat().st_size / 1024, 1)
            except OSError:
                size_kb = 0

            orientation = ("landscape" if img.width > img.height
                           else ("portrait" if img.height > img.width else "square"))
            mp = round((img.width * img.height) / 1_000_000, 1)

            # Build HUMAN-READABLE description (this is what the brain uses)
            lines = [
                f"This is a {img.format} image file named '{fname}'.",
                f"Dimensions: {img.width} x {img.height} pixels ({mp} megapixels), {orientation} orientation.",
                f"Color mode: {img.mode}. File size: {size_kb} KB.",
            ]
            if camera:
                lines.append(f"Captured with: {camera}.")
            if date_taken:
                lines.append(f"Date taken: {date_taken}.")
            if stem and len(stem) > 2:
                lines.append(f"The file name suggests it may be related to: {stem}.")

            # Store raw metadata at end (NOT used as summary)
            raw = json.dumps({"format": img.format, "mode": img.mode,
                              "width": img.width, "height": img.height,
                              "exif": exif_data})
            lines.append(f"[RAW_META]{raw}[/RAW_META]")
            return "\n".join(lines)
    except Exception as e:
        return f"[Image extraction error: {e}]"


def extract_text_from_xlsx(path):
    """Dump sheet names and up to 200 rows per sheet as pipe-joined text."""
    try:
        import openpyxl
        wb = openpyxl.load_workbook(path, read_only=True, data_only=True)
        try:
            texts = []
            for ws in wb.worksheets:
                texts.append(f"Sheet: {ws.title}")
                for row in ws.iter_rows(max_row=200, values_only=True):
                    row_text = " | ".join(str(c) for c in row if c is not None)
                    if row_text.strip():
                        texts.append(row_text)
            return "\n".join(texts)
        finally:
            # Read-only workbooks keep the file handle open until close().
            wb.close()
    except Exception as e:
        return f"[XLSX extraction error: {e}]"


def extract_text(file_path):
    """Dispatch to the right extractor based on the file extension.

    NOTE(review): '.xls' is routed to openpyxl, which only reads the
    OOXML (.xlsx) format; legacy .xls files will come back as an
    "[XLSX extraction error: ...]" string — TODO confirm this is intended.
    """
    ext = Path(file_path).suffix.lower()
    if ext == '.pdf':
        return extract_text_from_pdf(file_path)
    elif ext in ['.docx']:
        return extract_text_from_docx(file_path)
    elif ext in ['.txt', '.md', '.csv', '.json', '.html', '.xml', '.py', '.js']:
        return extract_text_from_txt(file_path)
    elif ext in ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff']:
        return extract_text_from_image(file_path)
    elif ext in ['.xlsx', '.xls']:
        return extract_text_from_xlsx(file_path)
    else:
        return f"[No text extractor for {ext}]"
# ─────────────────────────────────────────────
# LOCAL BRAIN ENGINE (Zero API)
# ─────────────────────────────────────────────

# Arabic stop words
STOPWORDS_AR = set([
    "في","من","إلى","على","عن","مع","هذا","هذه","التي","الذي","كان","كانت",
    "أن","إن","وأن","ولا","وهو","وهي","هو","هي","أو","لا","ما","كل","قد",
    "لم","لن","ثم","حتى","بعد","قبل","عند","منذ","خلال","بين","بعض","كيف",
    "لقد","إذا","لكن","ولكن","حين","أيضاً","أيضا","فقط","وقد","وكان","وهذا"
])

# English stop words
STOPWORDS_EN = set([
    "the","a","an","and","or","but","in","on","at","to","for","of","with",
    "by","from","is","are","was","were","be","been","have","has","had",
    "do","does","did","will","would","could","should","may","might","it",
    "its","this","that","these","those","he","she","they","we","you","i",
    "not","no","nor","so","yet","both","either","just","than","then","when",
    "where","who","which","what","how","if","as","up","out","about","into"
])

# Keeps word characters, whitespace, and the Arabic Unicode block;
# everything else becomes a space. Compiled once (used per-sentence below).
_NON_WORD_RE = re.compile(r'[^\w\s\u0600-\u06FF]')

# French function words, matched as WHOLE words (see detect_language).
_FRENCH_HINTS = {"le", "la", "les", "de", "du", "des", "et", "est", "une", "un"}


def clean_and_tokenize(text):
    """Extract meaningful tokens: lowercase words longer than 3 chars that
    are neither English nor Arabic stop words."""
    text = _NON_WORD_RE.sub(' ', text)
    return [t for t in text.lower().split()
            if len(t) > 3 and t not in STOPWORDS_EN and t not in STOPWORDS_AR]


def extract_keywords(text, top_n=20):
    """Extract top keywords by frequency (most frequent first)."""
    freq = Counter(clean_and_tokenize(text))
    return [word for word, _ in freq.most_common(top_n)]


def detect_language(text):
    """Simple language detection; returns 'ar', 'fr', or 'en'.

    BUG FIX: the previous version tested substrings like `'de ' in text`,
    which matched inside English words ("made easy" contains "de "),
    skewing results toward French. Hints are now counted only when they
    occur as whole words; the >3-distinct-hints threshold is unchanged.
    """
    arabic_chars = len(re.findall(r'[\u0600-\u06FF]', text))
    latin_chars = len(re.findall(r'[a-zA-Z]', text))
    # Whole-word French hint count (accent range covers é, è, à, ç, ...).
    words = set(re.findall(r'[a-zA-Z\u00C0-\u017F]+', text.lower()))
    french_count = len(words & _FRENCH_HINTS)
    if arabic_chars > latin_chars:
        return "ar"
    elif french_count > 3:
        return "fr"
    else:
        return "en"


def generate_summary(text, sentences=3):
    """Generate human-readable summary — never echo raw JSON or repeat the file.

    Extractive: sentences are scored by keyword density, the top ones are
    picked, then re-emitted in original document order.
    """
    # Strip raw meta tags from images before summarizing
    clean_text = re.sub(r'\[RAW_META\].*?\[/RAW_META\]', '', text, flags=re.DOTALL).strip()

    # Split into sentences (Latin terminators plus Arabic question mark)
    sent_pattern = r'(?<=[.!?؟])\s+|(?<=[.!?؟])$'
    sentences_list = re.split(sent_pattern, clean_text.strip())
    # Drop fragments and anything that looks like serialized data.
    sentences_list = [s.strip() for s in sentences_list
                      if len(s.strip()) > 20
                      and not s.strip().startswith('{')
                      and not s.strip().startswith('[')]

    if not sentences_list or len(clean_text.strip()) < 80:
        # Micro-file: generate a smart description, never echo
        words = clean_text.split()[:20]
        snippet = ' '.join(words)
        char_count = len(clean_text)
        return (f"This is a compact file ({char_count} characters) containing brief or specialized content. "
                f"It appears to be a short note, configuration snippet, or reference document. "
                f"Key excerpt: '{snippet}...'")

    # Score sentences by keyword density
    all_tokens = clean_and_tokenize(text)
    freq = Counter(all_tokens)

    def score_sentence(s):
        tokens = clean_and_tokenize(s)
        # +1 guards the empty-token case against division by zero.
        return sum(freq.get(t, 0) for t in tokens) / (len(tokens) + 1)

    scored = sorted(sentences_list, key=score_sentence, reverse=True)
    top = set(scored[:sentences])  # set for O(1) membership below

    # Re-order the selected sentences by original position
    result = []
    for s in sentences_list:
        if s in top:
            result.append(s)
            if len(result) == sentences:
                break
    return ' '.join(result) if result else sentences_list[0][:200] + '...'
def detect_content_type(file_path, text): """Guess content type from extension and content.""" ext = Path(file_path).suffix.lower() fname = Path(file_path).name.lower() text_lower = (text or '').lower() # By extension if ext in ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp']: return "image/photo" if ext in ['.xlsx', '.xls', '.csv']: return "data/spreadsheet" if ext in ['.mp3', '.wav', '.ogg', '.m4a']: return "media/audio" if ext in ['.mp4', '.avi', '.mov', '.mkv']: return "media/video" # By content keywords recipe_words = ['ingredients', 'recipe', 'tablespoon', 'cup', 'cook', 'bake', 'وصفة', 'مقادير', 'طريقة', 'دقيقة', 'غرام', 'recette', 'ingrédients'] invoice_words = ['invoice', 'total', 'amount', 'payment', 'due', 'فاتورة', 'مبلغ'] contract_words = ['agreement', 'contract', 'party', 'clause', 'عقد', 'اتفاقية'] recipe_score = sum(1 for w in recipe_words if w in text_lower) invoice_score = sum(1 for w in invoice_words if w in text_lower) contract_score = sum(1 for w in contract_words if w in text_lower) if recipe_score >= 3: return "document/recipe" if invoice_score >= 2: return "document/invoice" if contract_score >= 2: return "document/contract" return "document/generic" def extract_entities(text): """Simple named entity detection using patterns.""" entities = {"people": [], "places": [], "concepts": [], "dates": []} # Dates date_patterns = [ r'\b\d{4}-\d{2}-\d{2}\b', r'\b\d{1,2}/\d{1,2}/\d{2,4}\b', r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b', ] for pattern in date_patterns: found = re.findall(pattern, text, re.IGNORECASE) entities["dates"].extend(found[:5]) # Places (capitalized words after location prepositions) place_patterns = [ r'(?:in|at|from|to|near|المغرب|مراكش|الرباط|فاس|طنجة|Morocco|Marrakech|Rabat|Fes)\s+([A-Z][a-z]+)', ] for pattern in place_patterns: found = re.findall(pattern, text) entities["places"].extend(found[:5]) # Remove duplicates for k in entities: entities[k] = 
list(dict.fromkeys(entities[k]))[:5] return entities def generate_content_data(content_type, text): """Generate content-type-specific structured data.""" data = {} if content_type == "document/recipe": # Extract ingredients hints lines = text.split('\n') ingredient_lines = [] for line in lines: if any(unit in line.lower() for unit in ['cup', 'tbsp', 'tsp', 'gram', 'kg', 'g ', 'كوب', 'ملعقة', 'غرام', 'كيلو']): ingredient_lines.append(line.strip()) data["ingredients_hints"] = ingredient_lines[:10] # Try to find cooking time time_match = re.search(r'(\d+)\s*(?:minutes?|mins?|دقيقة|دقائق)', text, re.IGNORECASE) if time_match: data["cook_time_minutes"] = int(time_match.group(1)) word_count = len(text.split()) data["estimated_steps"] = max(1, word_count // 80) elif content_type == "image/photo": try: img_data = json.loads(text) data = img_data except: data["note"] = "Image file" elif content_type == "data/spreadsheet": lines = text.split('\n') data["estimated_rows"] = len([l for l in lines if '|' in l]) data["sheets"] = [l.replace('Sheet:', '').strip() for l in lines if l.startswith('Sheet:')] elif content_type == "document/generic": words = text.split() data["word_count"] = len(words) data["reading_time_minutes"] = max(1, len(words) // 200) headings = re.findall(r'^#{1,3}\s+(.+)$', text, re.MULTILINE) data["headings"] = headings[:10] return data def generate_questions(text, content_type, summary, keywords): """Pre-generate meaningful Q&A — never raw JSON answers.""" # Strip raw meta before using text for Q&A clean_text = re.sub(r'\[RAW_META\].*?\[/RAW_META\]', '', text, flags=re.DOTALL).strip() questions = [ {"q": "What is this file about?", "a": summary}, {"q": "What is this?", "a": summary}, {"q": "Summarize this file.", "a": summary}, {"q": "What are the main topics?", "a": ", ".join(keywords[:8]) if keywords else "Not enough content to extract topics."}, ] if content_type == "document/recipe": questions.extend([ {"q": "What kind of file is this?", "a": "This is a 
recipe document containing cooking instructions and ingredients."}, {"q": "ما هذا الملف؟", "a": "هذا ملف وصفة طبخ يحتوي على مكونات وطريقة التحضير."}, ]) elif content_type == "image/photo": # Parse embedded human-readable lines (NOT raw JSON) img_lines = [l for l in clean_text.split('\n') if l.strip() and not l.startswith('[')] img_desc = ' '.join(img_lines[:4]) if img_lines else summary # Extract dimensions from clean text dim_match = re.search(r'(\d+)\s*x\s*(\d+)\s*pixels', clean_text) dims = f"{dim_match.group(1)}×{dim_match.group(2)} pixels" if dim_match else "Unknown dimensions" orient_match = re.search(r'(landscape|portrait|square)', clean_text, re.IGNORECASE) orient = orient_match.group(1) if orient_match else "" fmt_match = re.search(r'This is a (\w+) image', clean_text) fmt = fmt_match.group(1) if fmt_match else "image" questions.extend([ {"q": "What type of image is this?", "a": f"This is a {fmt} image, {dims}{(', ' + orient + ' orientation') if orient else ''}."}, {"q": "What are the image dimensions?", "a": dims}, {"q": "Describe this image.", "a": img_desc}, {"q": "What is the file format?", "a": f"{fmt} format — a standard digital image format."}, ]) elif content_type == "document/invoice": questions.append({ "q": "What type of document is this?", "a": "This is a financial invoice or billing document." }) elif content_type == "data/spreadsheet": questions.append({ "q": "What type of file is this?", "a": "This is a spreadsheet or data table, likely containing rows and columns of structured data." 
}) # Language-specific lang = detect_language(clean_text) if lang == "ar": questions.append({"q": "ما هي الكلمات الرئيسية؟", "a": "، ".join(keywords[:8])}) questions.append({"q": "ما هذا الملف؟", "a": summary}) elif lang == "fr": questions.append({"q": "De quoi parle ce fichier?", "a": summary[:200]}) return questions # ───────────────────────────────────────────── # VIEW.HTML GENERATOR # ───────────────────────────────────────────── def generate_view_html(manifest, brain, original_filename): """Generate a self-contained viewer HTML.""" title = manifest.get("title", original_filename) author = manifest.get("author", {}).get("name", "Unknown") org = manifest.get("author", {}).get("org", "") content_type = manifest.get("content_type", "document/generic") created = manifest.get("created_at", "")[:10] summary = brain.get("summary", "") keywords = brain.get("keywords", [])[:12] questions = brain.get("questions", []) lang = manifest.get("lang", "en") sema_id = manifest.get("id", "") # Serialize Q&A for JS qa_json = json.dumps(questions, ensure_ascii=False) keywords_json = json.dumps(keywords, ensure_ascii=False) html = f'''