Document Operations Handbook

Unified guidance for working with structured documents (DOCX, XLSX, PDF) in SEA-Forge™.


Table of Contents


Overview

SEA-Forge™ provides structured document capabilities for DOCX, XLSX, and PDF files.

All operations preserve semantic structure and support traceability to Knowledge Graph concepts.


Capabilities

Format Read Write Style Templates
DOCX
XLSX
PDF ⚠️

⚠️ = PDF generation via DOCX → PDF export only


Document Formats

DOCX (Word Documents)

Use Cases:

Key Features:


XLSX (Excel Spreadsheets)

Use Cases:

Key Features:


PDF (Portable Documents)

Use Cases:

Key Features:

Limitations:


Core Operations

1. Creating Documents

DOCX Creation

1
2
3
4
5
6
# The package is installed as "python-docx" but imported as "docx".
from docx import Document

# Start from an empty document, add a title-level heading (level 0) and one
# paragraph, then write the result to disk.
doc = Document()
doc.add_heading('SEA-Forge™ ADR', 0)
doc.add_paragraph('Decision: Adopt Knowledge Graph')
doc.save('adr-001.docx')

XLSX Creation

1
2
3
4
5
6
7
from openpyxl import Workbook

# Create a workbook, write the header labels into row 1 of its active sheet,
# and save it.
wb = Workbook()
ws = wb.active
for column, title in enumerate(('Concept', 'Status'), start=1):
    ws.cell(row=1, column=column, value=title)
wb.save('traceability.xlsx')

2. Reading Documents

DOCX Reading

1
2
3
4
5
# The package is installed as "python-docx" but imported as "docx".
from docx import Document

# Open an existing document and print the text of each body paragraph.
doc = Document('existing.docx')
for paragraph in doc.paragraphs:
    print(paragraph.text)

XLSX Reading

1
2
3
4
5
6
from openpyxl import load_workbook

# Load the workbook and print every row of the active sheet as a tuple of
# plain cell values.
wb = load_workbook('data.xlsx')
ws = wb.active
for cell_values in ws.values:
    print(cell_values)

PDF Reading

1
2
3
4
5
import pypdf

# Print the extracted text of each page, in page order.
pdf_reader = pypdf.PdfReader('source.pdf')
for pdf_page in pdf_reader.pages:
    page_text = pdf_page.extract_text()
    print(page_text)

3. Styling Documents

DOCX Styles

1
2
3
4
5
6
7
8
9
# The package is installed as "python-docx" but imported as "docx".
from docx.shared import Pt, RGBColor
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT

# add_paragraph(text) puts the text into a single run, so runs[0] is the
# run that carries 'Important Text'.
paragraph = doc.add_paragraph('Important Text')
run = paragraph.runs[0]
run.bold = True
run.font.size = Pt(14)
run.font.color.rgb = RGBColor(0, 0, 255)  # blue
paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER

XLSX Styles

1
2
3
4
5
from openpyxl.styles import Font, PatternFill, Alignment

# Style cell A1: bold 14pt font, solid yellow fill, centered text.
header_cell = ws['A1']
header_cell.font = Font(bold=True, size=14)
header_cell.fill = PatternFill(start_color='FFFF00', fill_type='solid')
header_cell.alignment = Alignment(horizontal='center')

4. Tables

DOCX Tables

1
2
3
4
5
6
7
# Add a 3x3 table and apply a named table style.
# NOTE(review): assumes 'Light Grid Accent 1' exists in the document's
# styles — confirm when using a custom template.
table = doc.add_table(rows=3, cols=3)
table.style = 'Light Grid Accent 1'

# Fill the first row with the column headings.
header_cells = table.rows[0].cells
for position, label in enumerate(('Concept', 'Status', 'Owner')):
    header_cells[position].text = label

XLSX Tables

1
2
3
4
5
6
7
8
from openpyxl.worksheet.table import Table, TableStyleInfo

# Append a header row and one data row.
for record in (
    ['Concept', 'Status', 'Owner'],
    ['BoundedContext', 'Active', 'TeamA'],
):
    ws.append(record)

# Declare a styled table over A1:C2.
# NOTE(review): this range matches the appended rows only if the sheet was
# empty beforehand — confirm, or adjust ref.
table_style = TableStyleInfo(name='TableStyleMedium2')
tab = Table(displayName='ConceptTable', ref='A1:C2', tableStyleInfo=table_style)
ws.add_table(tab)

5. Formulas & Charts

XLSX Formulas

1
2
3
# Formulas are stored as strings that start with "=".
for cell_ref, formula in (
    ('D2', '=SUM(B2:C2)'),
    ('D3', '=AVERAGE(B2:B10)'),
    ('D4', '=IF(B2>100,"High","Low")'),
):
    ws[cell_ref] = formula

XLSX Charts

1
2
3
4
5
6
7
8
from openpyxl.chart import BarChart, Reference

# Column A (rows 2-10) supplies the category labels. Column B (rows 1-10)
# supplies the values; row 1 is included because titles_from_data=True takes
# the series title from the first cell of the reference.
categories = Reference(ws, min_col=1, min_row=2, max_row=10)
data = Reference(ws, min_col=2, min_row=1, max_row=10)

chart = BarChart()
chart.add_data(data, titles_from_data=True)
chart.set_categories(categories)

# Anchor the chart's top-left corner at cell E5.
ws.add_chart(chart, 'E5')

Pattern Library

Pattern 1: ADR Export to DOCX

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
def export_adr_to_docx(adr_md_path, output_docx_path):
    """Convert ADR markdown to formatted DOCX.

    Args:
        adr_md_path: Path of the ADR markdown file to read.
        output_docx_path: Path the generated .docx file is written to.

    The section-by-section conversion is intentionally left out of this
    example; as written, the output contains only the title heading.
    """
    import markdown  # used by the omitted section-conversion step
    # The package is installed as "python-docx" but imported as "docx".
    from docx import Document

    # Read markdown. The encoding is explicit so non-ASCII text is not
    # decoded with a platform-dependent default.
    with open(adr_md_path, 'r', encoding='utf-8') as f:
        md_content = f.read()

    # Parse sections
    doc = Document()
    doc.add_heading('SEA-Forge™ Architecture Decision Record', 0)

    # Add sections (title, status, context, decision, consequences)
    # [Implementation details omitted for brevity]

    doc.save(output_docx_path)

Pattern 2: Traceability Matrix to XLSX

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
def generate_traceability_matrix(mapping_data, output_xlsx_path):
    """Generate traceability matrix spreadsheet.

    Args:
        mapping_data: Iterable of mappings, each with the keys 'adr' (written
            as-is), 'prds' and 'sdss' (iterables of strings, joined with
            ', ').
        output_xlsx_path: Path the workbook is saved to.

    The sheet is titled 'Traceability'. Row 1 holds bold, filled headers;
    each mapping occupies one row from row 2 onward. The Coverage column is
    a formula that shows a check mark when the row's SDS cell is non-empty
    and a cross otherwise.
    """
    from openpyxl import Workbook
    from openpyxl.styles import Font, PatternFill

    wb = Workbook()
    ws = wb.active
    ws.title = 'Traceability'

    # Header row
    headers = ['ADR ID', 'PRD IDs', 'SDS IDs', 'Coverage']
    for idx, header in enumerate(headers, start=1):
        cell = ws.cell(row=1, column=idx, value=header)
        cell.font = Font(bold=True)
        cell.fill = PatternFill(start_color='366092', fill_type='solid')

    # Data rows
    for row_idx, mapping in enumerate(mapping_data, start=2):
        ws.cell(row=row_idx, column=1, value=mapping['adr'])
        ws.cell(row=row_idx, column=2, value=', '.join(mapping['prds']))
        ws.cell(row=row_idx, column=3, value=', '.join(mapping['sdss']))
        # The two IF branches must differ; with identical (empty) results the
        # Coverage column would always be blank.
        ws.cell(row=row_idx, column=4, value=f'=IF(C{row_idx}<>"", "✅", "❌")')

    wb.save(output_xlsx_path)

Pattern 3: PDF Content Extraction for RAG

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
def extract_pdf_for_indexing(pdf_path):
    """Collect a PDF's metadata and per-page content for indexing.

    Args:
        pdf_path: Path of the PDF to read (passed to ``pypdf.PdfReader``).

    Returns:
        A dict with 'metadata' (the reader's metadata object) and 'pages', a
        list with one dict per page holding 'page_num' (1-based), 'text'
        (result of ``extract_text()``) and 'links'.
    """
    import pypdf

    reader = pypdf.PdfReader(pdf_path)
    metadata = reader.metadata

    pages = [
        {
            'page_num': number,
            'text': page.extract_text(),
            # Raw '/Annots' entry of the page ([] when absent). This is the
            # page's annotation list as stored, not a list of hyperlink
            # targets.
            'links': page.get('/Annots', []),
        }
        for number, page in enumerate(reader.pages, start=1)
    ]

    return {'metadata': metadata, 'pages': pages}

Pattern 4: Metrics Dashboard to XLSX

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
def generate_metrics_dashboard(metrics, output_xlsx_path):
    """Write sprint metrics to a worksheet and plot them as a line chart.

    Args:
        metrics: Sized sequence of mappings with the keys 'date', 'velocity',
            'quality' and 'coverage'.
        output_xlsx_path: Path the workbook is saved to.
    """
    from openpyxl import Workbook
    from openpyxl.chart import LineChart, Reference

    workbook = Workbook()
    sheet = workbook.active
    sheet.title = 'Metrics'

    # Header row followed by one row per metric entry.
    sheet.append(['Date', 'Velocity', 'Quality', 'Coverage'])
    field_order = ('date', 'velocity', 'quality', 'coverage')
    for entry in metrics:
        sheet.append([entry[field] for field in field_order])

    # Data rows occupy rows 2 .. len(metrics) + 1.
    last_row = len(metrics) + 1
    # Columns B-D including the header row, which supplies the series titles.
    series = Reference(sheet, min_col=2, min_row=1, max_col=4, max_row=last_row)
    # Column A without the header supplies the category labels.
    labels = Reference(sheet, min_col=1, min_row=2, max_row=last_row)

    line_chart = LineChart()
    line_chart.title = 'Sprint Metrics'
    line_chart.add_data(series, titles_from_data=True)
    line_chart.set_categories(labels)
    sheet.add_chart(line_chart, 'F5')

    workbook.save(output_xlsx_path)

Best Practices

1. Semantic Anchoring

Link document content to Knowledge Graph concepts:

1
2
3
# Record the concept identifier and the related artifact IDs in the
# document's core properties.
core_props = doc.core_properties
core_props.subject = 'sea:BoundedContext'
core_props.keywords = 'ADR-021, SDS-012, PRD-026'

2. Template-Based Generation

Use templates for consistency:

1
2
3
4
5
# The package is installed as "python-docx" but imported as "docx".
from docx import Document

# Open the template as the starting document, edit it, and save under a new
# name so the template file itself is left unchanged.
template = Document('templates/adr-template.docx')
# Modify template placeholders
template.save('adr-new.docx')

3. Version Control

Track document versions:

1
2
3
from datetime import datetime, timezone

doc.core_properties.version = '1.0.0'
doc.core_properties.revision = 1
# NOTE(review): python-docx appears to serialize this timestamp with a
# trailing "Z" (UTC), so UTC time is supplied instead of naive local time —
# confirm against the installed python-docx version.
doc.core_properties.modified = datetime.now(timezone.utc)

4. Accessibility

Ensure documents are accessible:

1
2
3
4
5
6
# Add alt text to images.
# Document.add_picture() accepts only the image plus optional width/height;
# it has no alt-text argument. The description is therefore written to the
# "descr" attribute of the picture's docPr element.
# NOTE(review): this relies on the private `_inline` attribute — confirm it
# against the installed python-docx version.
picture = doc.add_picture('diagram.png')
picture._inline.docPr.set('descr', 'Architecture diagram showing bounded contexts')

# Use proper heading hierarchy
doc.add_heading('Section 1', level=1)
doc.add_heading('Subsection 1.1', level=2)

Troubleshooting

Issue: DOCX style not applying

Symptom: Style changes don’t appear in output

Solution:

1
2
3
4
5
6
7
# Ensure style exists in template
# The package is installed as "python-docx" but imported as "docx".
from docx.enum.style import WD_STYLE_TYPE
from docx.oxml.shared import OxmlElement

def add_style_if_missing(doc, style_name):
    """Add a paragraph style named ``style_name`` if the document lacks it.

    Args:
        doc: A python-docx document.
        style_name: Name of the style to look up in ``doc.styles``.

    Does nothing when a style with that name already exists.
    """
    if style_name not in doc.styles:
        # Add style programmatically; a new custom style starts with default
        # formatting until its font/paragraph format are configured.
        doc.styles.add_style(style_name, WD_STYLE_TYPE.PARAGRAPH)

Issue: XLSX formula not calculating

Symptom: Formula shows as text, not result

Solution:

1
2
3
# Store the formula string, then mark the cell explicitly as a formula cell.
formula_cell = ws['D2']
formula_cell.value = '=SUM(B2:C2)'
formula_cell.data_type = 'f'  # 'f' for formula

Issue: PDF extraction incomplete

Symptom: Missing text or garbled output

Solution:

1
2
3
4
5
6
# Try alternative PDF library
import pdfplumber

# The context manager closes the PDF when the block exits.
with pdfplumber.open('document.pdf') as pdf_document:
    for pdf_page in pdf_document.pages:
        # layout=True asks pdfplumber to preserve the page layout in the text
        text = pdf_page.extract_text(layout=True)


Appendix: Library References