Content is user-generated and unverified.

SAS Regex Comprehensive Course

Core SAS Regex Functions

PRXMATCH - Pattern Matching

sas
data test;
    text = "Phone: 555-123-4567";
    /* Find phone number pattern */
    pos = prxmatch('/\d{3}-\d{3}-\d{4}/', text);
    put pos=;  /* Output: pos=8 */
run;

PRXCHANGE - Pattern Replacement

sas
data clean_data;
    input name $50.;
    /* Remove extra spaces */
    clean_name = prxchange('s/\s+/ /g', -1, strip(name));
    datalines;
John    Smith
Mary  Jane   Doe
;
/* Output: 
   John Smith
   Mary Jane Doe */
run;

PRXPARSE - Compile Patterns

sas
data _null_;
    retain pattern_id;
    if _n_ = 1 then pattern_id = prxparse('/\b\w+@\w+\.\w+\b/');
    
    text = "Contact: john.doe@company.com for info";
    pos = prxmatch(pattern_id, text);
    if pos > 0 then do;
        email = prxposn(pattern_id, 0, text);
        put email=;  /* Output: email=john.doe@company.com */
    end;
run;

Real-World Data Cleaning Examples

Extract Product Codes

sas
data products;
    input description $100.;
    
    /* Extract product code pattern: ABC-1234 */
    pattern_id = prxparse('/[A-Z]{3}-\d{4}/');
    pos = prxmatch(pattern_id, description);
    
    if pos > 0 then 
        product_code = prxposn(pattern_id, 0, description);
    else 
        product_code = 'MISSING';
        
    datalines;
Item ABC-1234 Blue Widget $29.99
Special offer DEF-5678 Red Gadget
No code available for this item
Widget GHI-9012 Green Version 2.0
;
/* Output:
   ABC-1234
   DEF-5678
   MISSING
   GHI-9012 */
run;

Validate and Clean Phone Numbers

sas
data phone_cleanup;
    input raw_phone $20.;
    
    /* Remove non-digits */
    digits_only = prxchange('s/\D//g', -1, raw_phone);
    
    /* Format as (XXX) XXX-XXXX if 10 digits */
    if length(digits_only) = 10 then do;
        area_code = substr(digits_only, 1, 3);
        exchange = substr(digits_only, 4, 3);
        number = substr(digits_only, 7, 4);
        formatted_phone = cats('(', area_code, ') ', exchange, '-', number);
    end;
    else formatted_phone = 'INVALID';
    
    datalines;
555-123-4567
(555) 987-6543
555.111.2222
5551234567
555-12-345
;
/* Output:
   (555) 123-4567
   (555) 987-6543
   (555) 111-2222
   (555) 123-4567
   INVALID */
run;

Extract Multiple Email Addresses

sas
data email_extract;
    input text $200.;
    
    email_pattern = prxparse('/\b[\w.-]+@[\w.-]+\.\w+\b/');
    
    /* Find all emails in text */
    start = 1;
    email_count = 0;
    
    do while(prxmatch(email_pattern, substr(text, start)) > 0);
        pos = prxmatch(email_pattern, substr(text, start));
        email_count + 1;
        found_email = prxposn(email_pattern, 0, substr(text, start));
        
        /* Create variables email1, email2, etc. */
        call symputx(cats('email', email_count), found_email);
        
        start = start + pos + length(found_email) - 1;
    end;
    
    datalines;
Contact john@company.com or mary.smith@corp.net for details
Send reports to admin@system.org and backup@system.org
No email addresses in this text
;
run;

Advanced Pattern Matching

Extract Structured Data from Free Text

sas
data transaction_parse;
    input transaction $200.;
    
    /* Pattern for: DATE AMOUNT DESCRIPTION */
    pattern = prxparse('/(\d{2}\/\d{2}\/\d{4})\s+\$?(\d+\.\d{2})\s+(.+)/');
    
    if prxmatch(pattern, transaction) then do;
        trans_date = prxposn(pattern, 1, transaction);
        amount = input(prxposn(pattern, 2, transaction), 8.2);
        description = strip(prxposn(pattern, 3, transaction));
        parsed = 'YES';
    end;
    else parsed = 'NO';
    
    datalines;
12/15/2023 $125.50 Grocery Store Purchase
01/03/2024 45.00 Gas Station
Invalid transaction format
11/30/2023 $1,250.75 Monthly Rent Payment
;
/* Output:
   trans_date=12/15/2023, amount=125.5, description=Grocery Store Purchase
   trans_date=01/03/2024, amount=45, description=Gas Station
   parsed=NO
   (Note: Won't parse $1,250.75 due to comma) */
run;

Complex Name Parsing

sas
data name_parse;
    input full_name $50.;
    
    /* Pattern for: First [Middle] Last [Suffix] */
    name_pattern = prxparse('/^(\w+)\s+(?:(\w+)\s+)?(\w+)(?:\s+(Jr\.?|Sr\.?|III?|IV))?$/i');
    
    if prxmatch(name_pattern, strip(full_name)) then do;
        first_name = prxposn(name_pattern, 1, full_name);
        middle_name = prxposn(name_pattern, 2, full_name);
        last_name = prxposn(name_pattern, 3, full_name);
        suffix = prxposn(name_pattern, 4, full_name);
        
        if missing(middle_name) then middle_name = '';
        if missing(suffix) then suffix = '';
    end;
    
    datalines;
John Smith
Mary Jane Watson
Robert Smith Jr.
Elizabeth Anne Johnson III
;
run;

Data Validation Patterns

Credit Card Number Validation

sas
data cc_validation;
    input cc_number $20.;
    
    /* Remove spaces and dashes */
    clean_cc = prxchange('s/[\s-]//g', -1, cc_number);
    
    /* Validate patterns */
    visa_pattern = prxparse('/^4\d{15}$/');
    mc_pattern = prxparse('/^5[1-5]\d{14}$/');
    amex_pattern = prxparse('/^3[47]\d{13}$/');
    
    if prxmatch(visa_pattern, clean_cc) then cc_type = 'VISA';
    else if prxmatch(mc_pattern, clean_cc) then cc_type = 'MASTERCARD';
    else if prxmatch(amex_pattern, clean_cc) then cc_type = 'AMEX';
    else cc_type = 'INVALID';
    
    datalines;
4111-1111-1111-1111
5555 5555 5555 4444
3782 822463 10005
1234567890123456
;
/* Output:
   VISA
   MASTERCARD
   AMEX
   INVALID */
run;

Social Security Number Validation

sas
data ssn_validation;
    input ssn $15.;
    
    /* Valid SSN pattern: XXX-XX-XXXX */
    ssn_pattern = prxparse('/^\d{3}-\d{2}-\d{4}$/');
    
    if prxmatch(ssn_pattern, ssn) then do;
        valid_format = 'YES';
        
        /* Extract parts */
        area = substr(ssn, 1, 3);
        group = substr(ssn, 5, 2);
        serial = substr(ssn, 8, 4);
        
        /* Basic validation rules */
        if area in ('000', '666') or area >= '900' then valid_ssn = 'NO';
        else if group = '00' then valid_ssn = 'NO';
        else if serial = '0000' then valid_ssn = 'NO';
        else valid_ssn = 'YES';
    end;
    else do;
        valid_format = 'NO';
        valid_ssn = 'NO';
    end;
    
    datalines;
123-45-6789
000-12-3456
123-00-4567
123-45-0000
12345-6789
;
run;

Log File Analysis

Parse Web Server Logs

sas
data log_analysis;
    input log_line $300.;
    
    /* Common Log Format pattern */
    log_pattern = prxparse('/^(\S+)\s+\S+\s+\S+\s+\[([^\]]+)\]\s+"(\w+)\s+(\S+)\s+\S+"\s+(\d+)\s+(\d+|-)/');
    
    if prxmatch(log_pattern, log_line) then do;
        ip_address = prxposn(log_pattern, 1, log_line);
        timestamp = prxposn(log_pattern, 2, log_line);
        method = prxposn(log_pattern, 3, log_line);
        url = prxposn(log_pattern, 4, log_line);
        status_code = input(prxposn(log_pattern, 5, log_line), 8.);
        bytes_sent = prxposn(log_pattern, 6, log_line);
        
        /* Categorize status codes */
        if status_code < 300 then status_category = 'SUCCESS';
        else if status_code < 400 then status_category = 'REDIRECT';
        else if status_code < 500 then status_category = 'CLIENT_ERROR';
        else status_category = 'SERVER_ERROR';
    end;
    
    datalines;
192.168.1.1 - - [25/Dec/2023:10:15:32 +0000] "GET /index.html HTTP/1.1" 200 1234
10.0.0.5 - - [25/Dec/2023:10:16:45 +0000] "POST /api/data HTTP/1.1" 404 512
;
run;

Performance Optimization

Compiled Patterns for Large Datasets

sas
data large_dataset;
    /* Simulate large dataset */
    do i = 1 to 1000000;
        text = cats('Record ', i, ' with email user', i, '@domain.com');
        output;
    end;
run;

data extract_emails;
    set large_dataset;
    
    /* Compile pattern once, retain across observations */
    retain email_pattern;
    if _n_ = 1 then email_pattern = prxparse('/\b[\w.-]+@[\w.-]+\.\w+\b/');
    
    if prxmatch(email_pattern, text) then 
        email = prxposn(email_pattern, 0, text);
    
    drop email_pattern;
run;

Using PERL Regular Expressions in PROC SQL

sas
proc sql;
    create table phone_numbers as
    select name, phone,
           case 
               when prxmatch('/^\(\d{3}\)\s?\d{3}-\d{4}$/', strip(phone)) then 'VALID'
               when prxmatch('/^\d{3}-\d{3}-\d{4}$/', strip(phone)) then 'VALID'
               when prxmatch('/^\d{10}$/', compress(phone)) then 'NEEDS_FORMAT'
               else 'INVALID'
           end as phone_status
    from contact_list
    where prxmatch('/\d/', phone) > 0;  /* Only records with digits */
quit;

Advanced Use Cases

Extract Financial Data from Text

sas
data financial_extract;
    input report_text $500.;
    
    /* Pattern for currency amounts */
    currency_pattern = prxparse('/\$[\d,]+\.?\d*/');
    
    /* Pattern for percentages */
    percent_pattern = prxparse('/\d+\.?\d*%/');
    
    /* Extract all currency amounts */
    start = 1;
    amount_count = 0;
    total_amount = 0;
    
    do while(prxmatch(currency_pattern, substr(report_text, start)) > 0);
        pos = prxmatch(currency_pattern, substr(report_text, start));
        amount_count + 1;
        found_amount = prxposn(currency_pattern, 0, substr(report_text, start));
        
        /* Convert to numeric */
        numeric_amount = input(compress(found_amount, '$,'), 12.2);
        total_amount + numeric_amount;
        
        start = start + pos + length(found_amount) - 1;
    end;
    
    /* Extract percentage */
    if prxmatch(percent_pattern, report_text) then 
        percentage = prxposn(percent_pattern, 0, report_text);
    
    datalines;
Revenue increased by $125,000 with expenses of $45,500 showing 15.5% growth
Total sales: $2,500,000 profit margin improved to 8.2%
;
run;

Data Masking with Regex

sas
data masked_data;
    input sensitive_info $200.;
    
    /* Mask SSN: XXX-XX-1234 -> XXX-XX-****  */
    masked_ssn = prxchange('s/(\d{3}-\d{2}-)\d{4}/$1****/g', -1, sensitive_info);
    
    /* Mask credit cards: show only last 4 digits */
    masked_cc = prxchange('s/\b(\d{4}[\s-]?)(\d{4}[\s-]?)(\d{4}[\s-]?)(\d{4})\b/****-****-****-$4/g', -1, masked_ssn);
    
    /* Mask email domains */
    final_masked = prxchange('s/(@)[\w.-]+(\.\w+)/$1****$2/g', -1, masked_cc);
    
    datalines;
Customer SSN: 123-45-6789 CC: 4111-1111-1111-1111 Email: john@company.com
Account 987-65-4321 Card 5555-5555-5555-4444 Contact mary@corp.net
;
run;

Error Handling and Debugging

Pattern Testing and Validation

sas
%macro test_pattern(pattern, test_string);
    data _null_;
        pattern_id = prxparse("&pattern");
        if pattern_id > 0 then do;
            result = prxmatch(pattern_id, "&test_string");
            if result > 0 then 
                put "MATCH: &test_string at position " result;
            else 
                put "NO MATCH: &test_string";
        end;
        else put "ERROR: Invalid pattern &pattern";
    run;
%mend;

/* Test different patterns */
%test_pattern('/\d{3}-\d{2}-\d{4}/', 123-45-6789);
%test_pattern('/\b\w+@\w+\.\w+\b/', john@company.com);
%test_pattern('/[invalid/', test);  /* Invalid pattern */

Common Regex Pitfalls in SAS

sas
data regex_pitfalls;
    input test_data $50.;
    
    /* WRONG: Escaping issues */
    /* wrong_pattern = '/\d+\.\d+/';  /* Single backslash */
    
    /* CORRECT: Double backslash in SAS */
    correct_pattern = prxparse('/\\d+\\.\\d+/');
    
    /* WRONG: Not anchoring when needed */
    loose_match = prxmatch('/\\d{3}/', test_data);  /* Matches 123 in 1234567 */
    
    /* CORRECT: Anchored match */
    exact_match = prxmatch('/^\\d{3}$/', test_data);  /* Matches exactly 3 digits */
    
    datalines;
123
1234
abc123def
;
run;

Clinical Trials Programming Use Cases

SDTM Domain Data Cleaning

Adverse Events (AE) Domain - MedDRA Coding

sas
data ae_clean;
    input usubjid $20. aeterm $200.;
    
    /* Standardize AE terms - remove extra spaces, fix case */
    clean_term = propcase(prxchange('s/\s+/ /g', -1, strip(aeterm)));
    
    /* Extract severity from AE term */
    severity_pattern = prxparse('/\b(mild|moderate|severe|serious)\b/i');
    if prxmatch(severity_pattern, aeterm) then do;
        aesev = upcase(prxposn(severity_pattern, 0, aeterm));
        /* Remove severity from term */
        clean_aeterm = prxchange('s/\b(mild|moderate|severe|serious)\b//i', -1, clean_term);
        clean_aeterm = prxchange('s/\s+/ /g', -1, strip(clean_aeterm));
    end;
    else do;
        aesev = '';
        clean_aeterm = clean_term;
    end;
    
    /* Extract body system from structured terms */
    body_sys_pattern = prxparse('/^([^:]+):\s*(.+)/');
    if prxmatch(body_sys_pattern, clean_aeterm) then do;
        aebodsys = strip(prxposn(body_sys_pattern, 1, clean_aeterm));
        aedecod = strip(prxposn(body_sys_pattern, 2, clean_aeterm));
    end;
    
    datalines;
STUDY001-001 HEADACHE   SEVERE
STUDY001-002 Gastrointestinal: Nausea mild
STUDY001-003    fatigue     moderate
STUDY001-004 Respiratory: shortness of breath
STUDY001-005 RASH SERIOUS
;
/* Output:
   STUDY001-001: aesev=SEVERE, clean_aeterm=Headache
   STUDY001-002: aebodsys=Gastrointestinal, aedecod=Nausea
   STUDY001-003: aesev=MODERATE, clean_aeterm=Fatigue */
run;

Laboratory Data (LB) - Reference Range Parsing

sas
data lb_parse;
    input usubjid $20. lbtest $30. lborres $20. lbornrlo $20. lbornrhi $20.;
    
    /* Parse reference ranges like "5.0-10.5", "<=5.0", ">=2.1" */
    range_pattern = prxparse('/^([<>=]*)\s*(\d+\.?\d*)\s*-?\s*(\d+\.?\d*)?/');
    
    /* Parse LBORRES for numeric extraction */
    result_pattern = prxparse('/([<>=]*)\s*(\d+\.?\d*)/');
    
    /* Extract numeric result */
    if prxmatch(result_pattern, lborres) then do;
        lbstresc = prxposn(result_pattern, 2, lborres);
        lbstresn = input(lbstresc, 8.2);
        
        /* Extract comparison operator */
        operator = prxposn(result_pattern, 1, lborres);
        if operator ne '' then lbstresc = cats(operator, lbstresn);
    end;
    
    /* Parse reference range low */
    if prxmatch(range_pattern, lbornrlo) then 
        lbstnrlo = input(prxposn(range_pattern, 2, lbornrlo), 8.2);
    
    /* Parse reference range high */  
    if prxmatch(range_pattern, lbornrhi) then 
        lbstnrhi = input(prxposn(range_pattern, 2, lbornrhi), 8.2);
    
    /* Determine abnormal flags */
    if lbstresn ne . then do;
        if lbstnrlo ne . and lbstresn < lbstnrlo then lbnrind = 'LOW';
        else if lbstnrhi ne . and lbstresn > lbstnrhi then lbnrind = 'HIGH';
        else if lbstnrlo ne . and lbstnrhi ne . then lbnrind = 'NORMAL';
        else lbnrind = '';
    end;
    
    datalines;
STUDY001-001 Hemoglobin 12.5 10.0 16.0
STUDY001-002 Glucose <5.0 3.5 7.0
STUDY001-003 Creatinine >=2.5 0.5 1.2
STUDY001-004 ALT 45 <=40 <=40
;
/* Output:
   STUDY001-001: lbstresn=12.5, lbnrind=NORMAL
   STUDY001-002: lbstresn=5.0, lbnrind=HIGH, lbstresc=<5.0
   STUDY001-003: lbstresn=2.5, lbnrind=HIGH, lbstresc=>=2.5 */
run;

Concomitant Medications (CM) - Drug Name Standardization

sas
data cm_standardize;
    input usubjid $20. cmtrt $100.;
    
    /* Extract drug name and strength */
    drug_pattern = prxparse('/^([^0-9]+?)\s*(\d+\.?\d*\s*mg|mcg|g|ml|units?)\s*(.*)/i');
    
    if prxmatch(drug_pattern, strip(cmtrt)) then do;
        cmdecod = strip(propcase(prxposn(drug_pattern, 1, cmtrt)));
        cmdose = prxposn(drug_pattern, 2, cmtrt);
        cmroute = strip(upcase(prxposn(drug_pattern, 3, cmtrt)));
        
        /* Clean drug name - remove common suffixes */
        cmdecod = prxchange('s/\s+(tablet|capsule|injection|cream|gel)$//i', -1, cmdecod);
        
        /* Extract numeric dose */
        dose_num_pattern = prxparse('/(\d+\.?\d*)/');
        if prxmatch(dose_num_pattern, cmdose) then 
            cmdostxt = prxposn(dose_num_pattern, 0, cmdose);
            
        /* Extract dose unit */
        unit_pattern = prxparse('/(mg|mcg|g|ml|units?)/i');
        if prxmatch(unit_pattern, cmdose) then 
            cmdosu = upcase(prxposn(unit_pattern, 0, cmdose));
    end;
    else do;
        cmdecod = propcase(strip(cmtrt));
        cmdose = '';
        cmroute = '';
    end;
    
    /* Standardize route of administration */
    if index(upcase(cmroute), 'ORAL') or index(upcase(cmroute), 'PO') then cmroute = 'ORAL';
    else if index(upcase(cmroute), 'IV') or index(upcase(cmroute), 'INTRAVENOUS') then cmroute = 'INTRAVENOUS';
    else if index(upcase(cmroute), 'IM') or index(upcase(cmroute), 'INTRAMUSCULAR') then cmroute = 'INTRAMUSCULAR';
    else if index(upcase(cmroute), 'TOPICAL') then cmroute = 'TOPICAL';
    
    datalines;
STUDY001-001 Acetaminophen 500mg tablet oral
STUDY001-002 lisinopril 10 mg PO daily
STUDY001-003 Insulin 10 units subcutaneous
STUDY001-004 IBUPROFEN 200MG CAPSULE
STUDY001-005 Hydrocortisone 1% cream topical
;
/* Output:
   STUDY001-001: cmdecod=Acetaminophen, cmdostxt=500, cmdosu=MG, cmroute=ORAL
   STUDY001-002: cmdecod=Lisinopril, cmdostxt=10, cmdosu=MG, cmroute=ORAL
   STUDY001-003: cmdecod=Insulin, cmdostxt=10, cmdosu=UNITS, cmroute=SUBCUTANEOUS */
run;

CDISC Compliance and Validation

Subject ID Validation

sas
data dm_validation;
    input usubjid $20.;
    
    /* Standard format: STUDY###-###-### */
    subjid_pattern = prxparse('/^([A-Z]+\d{3})-(\d{3})-(\d{3})$/');
    
    if prxmatch(subjid_pattern, usubjid) then do;
        studyid = prxposn(subjid_pattern, 1, usubjid);
        siteid = prxposn(subjid_pattern, 2, usubjid);
        subjid_num = prxposn(subjid_pattern, 3, usubjid);
        format_valid = 'Y';
        
        /* Check for sequential numbering issues */
        if input(subjid_num, 8.) = . then seq_valid = 'N';
        else seq_valid = 'Y';
    end;
    else do;
        format_valid = 'N';
        seq_valid = 'N';
        
        /* Try to extract study ID from malformed USUBJID */
        study_extract = prxchange('s/^([A-Z]+\d{3}).*/$1/', -1, usubjid);
        if study_extract ne usubjid then studyid = study_extract;
    end;
    
    datalines;
STUDY001-001-001
STUDY001-001-002
INVALID-SUBJID
STUDY002-999-ABC
STUDY001-001-1
;
/* Output:
   STUDY001-001-001: format_valid=Y, studyid=STUDY001, siteid=001
   INVALID-SUBJID: format_valid=N
   STUDY002-999-ABC: format_valid=N, seq_valid=N */
run;

Visit Window Validation

sas
data visit_validation;
    input usubjid $20. visitnum 8. visit $30. svstdtc $20.;
    
    /* Extract visit day from visit description */
    day_pattern = prxparse('/Day\s+(\d+)/i');
    week_pattern = prxparse('/Week\s+(\d+)/i');
    month_pattern = prxparse('/Month\s+(\d+)/i');
    
    if prxmatch(day_pattern, visit) then do;
        visit_day = input(prxposn(day_pattern, 1, visit), 8.);
        visit_type = 'DAY';
    end;
    else if prxmatch(week_pattern, visit) then do;
        visit_day = input(prxposn(week_pattern, 1, visit), 8.) * 7;
        visit_type = 'WEEK';
    end;
    else if prxmatch(month_pattern, visit) then do;
        visit_day = input(prxposn(month_pattern, 1, visit), 8.) * 30;
        visit_type = 'MONTH';
    end;
    else if upcase(visit) = 'SCREENING' then do;
        visit_day = -14;
        visit_type = 'SCREENING';
    end;
    else if upcase(visit) = 'BASELINE' then do;
        visit_day = 0;
        visit_type = 'BASELINE';
    end;
    
    /* Validate visit numbering sequence */
    if visitnum ne . and visit_day ne . then do;
        if visitnum = 1 and visit_day ne -14 then visit_seq_issue = 'Y';
        else if visitnum = 2 and visit_day ne 0 then visit_seq_issue = 'Y';
        else visit_seq_issue = 'N';
    end;
    
    datalines;
STUDY001-001-001 1 Screening 2023-01-15
STUDY001-001-001 2 Baseline 2023-01-29
STUDY001-001-001 3 Day 7 2023-02-05
STUDY001-001-001 4 Week 4 2023-02-26
STUDY001-001-001 5 Month 3 2023-04-29
;
/* Output:
   Visit 1: visit_day=-14, visit_type=SCREENING
   Visit 3: visit_day=7, visit_type=DAY
   Visit 4: visit_day=28, visit_type=WEEK */
run;

Medical History and Prior Procedures

ICD-10 Code Validation

sas
data mh_icd10;
    input usubjid $20. mhterm $100. mhdecod $10.;
    
    /* ICD-10 pattern: Letter followed by 2 digits, optional decimal and 1-2 more digits */
    icd10_pattern = prxparse('/^[A-Z]\d{2}(\.?\d{1,2})?$/');
    
    /* ICD-10-CM pattern (US specific) */
    icd10cm_pattern = prxparse('/^[A-Z]\d{2}\.[A-Z0-9]{1,4}$/');
    
    if prxmatch(icd10_pattern, mhdecod) then do;
        icd10_valid = 'Y';
        
        /* Extract category */
        category = substr(mhdecod, 1, 1);
        
        /* Categorize by ICD-10 chapter */
        select(category);
            when('A','B') icd10_chapter = 'Infectious diseases';
            when('C','D') icd10_chapter = 'Neoplasms/Blood disorders';
            when('E') icd10_chapter = 'Endocrine/Metabolic';
            when('F') icd10_chapter = 'Mental/Behavioral';
            when('G') icd10_chapter = 'Nervous system';
            when('H') icd10_chapter = 'Eye/Ear';
            when('I') icd10_chapter = 'Circulatory';
            when('J') icd10_chapter = 'Respiratory';
            when('K') icd10_chapter = 'Digestive';
            when('L') icd10_chapter = 'Skin';
            when('M') icd10_chapter = 'Musculoskeletal';
            when('N') icd10_chapter = 'Genitourinary';
            otherwise icd10_chapter = 'Other';
        end;
    end;
    else do;
        icd10_valid = 'N';
        icd10_chapter = '';
    end;
    
    datalines;
STUDY001-001-001 Hypertension I10
STUDY001-001-001 Type 2 Diabetes E11.9
STUDY001-001-001 Depression F32.1
STUDY001-001-001 Asthma J45
STUDY001-001-001 Back Pain INVALID
;
/* Output:
   I10: icd10_valid=Y, icd10_chapter=Circulatory
   E11.9: icd10_valid=Y, icd10_chapter=Endocrine/Metabolic
   INVALID: icd10_valid=N */
run;

Safety and Efficacy Analysis

SAE Narrative Text Mining

sas
data sae_analysis;
    input usubjid $20. aeterm $50. narrative $500.;
    
    /* Extract timeline information */
    timeline_pattern = prxparse('/(\d+)\s+(day|week|month|hour)s?\s+(after|before|since)/i');
    
    if prxmatch(timeline_pattern, narrative) then do;
        time_value = input(prxposn(timeline_pattern, 1, narrative), 8.);
        time_unit = upcase(prxposn(timeline_pattern, 2, narrative));
        time_relation = upcase(prxposn(timeline_pattern, 3, narrative));
        
        /* Convert to days */
        select(time_unit);
            when('HOUR') timeline_days = time_value / 24;
            when('DAY') timeline_days = time_value;
            when('WEEK') timeline_days = time_value * 7;
            when('MONTH') timeline_days = time_value * 30;
        end;
    end;
    
    /* Check for hospitalization */
    hosp_pattern = prxparse('/hospital|admit|emergency|ER|ICU/i');
    hospitalized = ifc(prxmatch(hosp_pattern, narrative) > 0, 'Y', 'N');
    
    /* Extract outcome information */
    outcome_pattern = prxparse('/\b(resolved|recovering|ongoing|fatal|death)\b/i');
    if prxmatch(outcome_pattern, narrative) then 
        outcome = upcase(prxposn(outcome_pattern, 0, narrative));
    
    /* Drug relationship assessment */
    relation_pattern = prxparse('/\b(related|unrelated|possibly related|probably related)\b/i');
    if prxmatch(relation_pattern, narrative) then 
        causality = propcase(prxposn(relation_pattern, 0, narrative));
    
    datalines;
STUDY001-001-001 Chest Pain Patient developed chest pain 3 days after starting study drug. Admitted to hospital for evaluation. Event resolved after 5 days and was assessed as possibly related to study medication.
STUDY001-002-002 Syncope Subject experienced syncope 2 weeks since last dose. Taken to emergency room. Currently recovering. Unrelated to study drug.
;
/* Output:
   001-001: time_value=3, time_unit=DAY, hospitalized=Y, outcome=RESOLVED, causality=Possibly Related
   002-002: time_value=2, time_unit=WEEK, hospitalized=Y, outcome=RECOVERING, causality=Unrelated */
run;

Data Transfer and Integration

External Lab Data Integration

sas
data lab_integration;
    input raw_data $300.;
    
    /* Parse pipe-delimited lab results */
    /* Format: SubjectID|TestName|Result|Units|RefLow|RefHigh|Date */
    lab_pattern = prxparse('/^([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]*)\|([^|]*)\|([^|]+)$/');
    
    if prxmatch(lab_pattern, raw_data) then do;
        ext_subjid = strip(prxposn(lab_pattern, 1, raw_data));
        ext_testname = strip(prxposn(lab_pattern, 2, raw_data));
        ext_result = strip(prxposn(lab_pattern, 3, raw_data));
        ext_units = strip(prxposn(lab_pattern, 4, raw_data));
        ext_reflow = strip(prxposn(lab_pattern, 5, raw_data));
        ext_refhigh = strip(prxposn(lab_pattern, 6, raw_data));
        ext_date = strip(prxposn(lab_pattern, 7, raw_data));
        
        /* Map external subject ID to study format */
        if prxmatch('/^\d{4}$/', ext_subjid) then 
            usubjid = cats('STUDY001-001-', put(input(ext_subjid, 8.), z3.));
        
        /* Standardize test names */
        select(upcase(ext_testname));
            when('HGB', 'HEMOGLOBIN') lbtest = 'Hemoglobin';
            when('HCT', 'HEMATOCRIT') lbtest = 'Hematocrit';
            when('WBC', 'WHITE BLOOD CELLS') lbtest = 'White Blood Cell Count';
            when('GLUC', 'GLUCOSE') lbtest = 'Glucose';
            when('CREAT', 'CREATININE') lbtest = 'Creatinine';
            otherwise lbtest = propcase(ext_testname);
        end;
        
        parsed_successfully = 'Y';
    end;
    else parsed_successfully = 'N';
    
    datalines;
1001|HGB|12.5|g/dL|12.0|16.0|2023-03-15
1002|GLUC|95|mg/dL|70|100|2023-03-15
1003|Invalid format missing pipes
1004|WBC|7.2|K/uL|4.0|11.0|2023-03-16
;
/* Output:
   1001: usubjid=STUDY001-001-001, lbtest=Hemoglobin, parsed_successfully=Y
   1003: parsed_successfully=N */
run;

Regulatory Submission Preparation

SDTM Comment Field Validation

sas
data sdtm_comments;
    input domain $2. variable $8. comment $200.;
    
    /* Check for required comment patterns for regulatory submission */
    
    /* Missing value explanation */
    missing_pattern = prxparse('/\bNOT\s+(DONE|PERFORMED|COLLECTED|AVAILABLE)\b/i');
    
    /* Imputation flag */
    impute_pattern = prxparse('/\b(IMPUTED|ESTIMATED|DERIVED)\b/i');
    
    /* Protocol deviation */
    deviation_pattern = prxparse('/\b(DEVIATION|VIOLATION|OUT\s+OF\s+WINDOW)\b/i');
    
    /* Data query resolution */
    query_pattern = prxparse('/\b(QUERY|CLARIFICATION|CORRECTION)\b/i');
    
    comment_type = '';
    if prxmatch(missing_pattern, comment) then comment_type = 'MISSING_DATA';
    else if prxmatch(impute_pattern, comment) then comment_type = 'IMPUTATION';
    else if prxmatch(deviation_pattern, comment) then comment_type = 'DEVIATION';
    else if prxmatch(query_pattern, comment) then comment_type = 'QUERY_RESOLUTION';
    else comment_type = 'OTHER';
    
    /* Check comment length (regulatory requirement) */
    comment_length = length(strip(comment));
    if comment_length > 200 then comment_length_flag = 'EXCEEDS_LIMIT';
    else comment_length_flag = 'OK';
    
    datalines;
VS VSORRES Visit not performed due to subject illness
LB LBORRES Lab sample not collected - subject missed visit
EG EGORRES ECG reading imputed from previous visit
AE AESTDTC Date estimated based on subject diary
CM CMSTDTC Medication start date query resolved with site
;
/* Output:
   VS: comment_type=MISSING_DATA
   LB: comment_type=MISSING_DATA  
   EG: comment_type=IMPUTATION
   AE: comment_type=IMPUTATION */
run;

Quick Reference

Common SAS Regex Patterns

sas
/* Email: */          /\b[\w.-]+@[\w.-]+\.\w+\b/
/* Phone: */         /\d{3}-\d{3}-\d{4}/
/* SSN: */           /\d{3}-\d{2}-\d{4}/
/* Credit Card: */   /\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}/
/* IP Address: */    /\b(?:\d{1,3}\.){3}\d{1,3}\b/
/* URL: */           /https?:\/\/[\w.-]+\.\w+/
/* Currency: */      /\$[\d,]+\.?\d*/
/* Date MM/DD/YYYY: */ /\d{1,2}\/\d{1,2}\/\d{4}/
/* Alphanumeric: */  /^[A-Za-z0-9]+$/
/* Remove spaces: */ s/\s+/ /g

Exhaustive Pattern Library

Personal Identifiers

sas
/* Social Security Numbers */
/^\d{3}-\d{2}-\d{4}$/                    /* XXX-XX-XXXX */
/^\d{3}\s\d{2}\s\d{4}$/                  /* XXX XX XXXX */
/^\d{9}$/                                /* XXXXXXXXX */

/* Driver's License (US States) */
/^[A-Z]\d{7}$/                          /* CA format: A1234567 */
/^[A-Z]{2}\d{6}$/                       /* NY format: AB123456 */
/^\d{8}$/                               /* FL format: 12345678 */
/^[A-Z]\d{8}$/                          /* TX format: A12345678 */

/* Passport Numbers */
/^[A-Z]\d{8}$/                          /* US: A12345678 */
/^[A-Z]{2}\d{7}$/                       /* UK: AB1234567 */
/^\d{2}[A-Z]{2}\d{5}$/                  /* German: 12AB12345 */

/* Tax ID Numbers */
/^\d{2}-\d{7}$/                         /* EIN: XX-XXXXXXX */
/^\d{3}-\d{2}-\d{4}$/                   /* ITIN: XXX-XX-XXXX */

Contact Information

sas
/* Phone Numbers - International */
/^\+1\s?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}$/    /* US: +1 (555) 123-4567 */
/^\+44\s?\d{4}\s?\d{6}$/                         /* UK: +44 1234 567890 */
/^\+49\s?\d{3}\s?\d{8}$/                         /* Germany: +49 123 12345678 */
/^\+33\s?\d{1}\s?\d{8}$/                         /* France: +33 1 12345678 */
/^\+86\s?\d{3}\s?\d{8}$/                         /* China: +86 138 12345678 */

/* Phone Numbers - Domestic Formats */
/^\(\d{3}\)\s?\d{3}-\d{4}$/             /* (555) 123-4567 */
/^\d{3}-\d{3}-\d{4}$/                   /* 555-123-4567 */
/^\d{3}\.\d{3}\.\d{4}$/                 /* 555.123.4567 */
/^\d{10}$/                              /* 5551234567 */
/^1-\d{3}-\d{3}-\d{4}$/                 /* 1-555-123-4567 */

/* Email Addresses - Basic to Complex */
/^\w+@\w+\.\w+$/                        /* Basic: user@domain.com */
/^[\w.-]+@[\w.-]+\.\w{2,}$/             /* Standard: user.name@sub.domain.com */
/^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/  /* RFC compliant */
/^[\w._%+-]+@[\w.-]+\.\w{2,}(?:\.\w{2,})?$/         /* Multiple TLDs */

/* Addresses */
/^\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Drive|Dr|Lane|Ln|Boulevard|Blvd)\.?$/i
/^P\.?O\.?\s+Box\s+\d+$/i               /* PO Box 1234 */
/^\d{5}(?:-\d{4})?$/                    /* ZIP: 12345 or 12345-6789 */
/^[A-Z]\d[A-Z]\s?\d[A-Z]\d$/            /* Canadian Postal: A1B 2C3 */

Financial Data

sas
/* Credit Card Numbers */
/^4\d{15}$/                             /* Visa: 16 digits starting with 4 */
/^5[1-5]\d{14}$/                        /* Mastercard: 16 digits starting 51-55 */
/^3[47]\d{13}$/                         /* Amex: 15 digits starting 34/37 */
/^6(?:011|5\d{2})\d{12}$/               /* Discover: 16 digits */
/^35(?:2[89]|[3-8]\d)\d{12}$/           /* JCB */

/* Bank Account Numbers */
/^\d{8,17}$/                            /* US Bank Account */
/^\d{2}\s?\d{4}\s?\d{4}\s?\d{4}\s?\d{4}\s?\d{2}$/  /* IBAN */
/^[A-Z]{4}[A-Z]{2}\d{2}[A-Z0-9]{12}$/   /* SWIFT/BIC */

/* Currency Amounts */
/^\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?$/    /* USD: $1,234.56 */
/^€\d{1,3}(?:\.\d{3})*(?:,\d{2})?$/     /* EUR: €1.234,56 */
/^£\d{1,3}(?:,\d{3})*(?:\.\d{2})?$/     /* GBP: £1,234.56 */
/^¥\d{1,3}(?:,\d{3})*$/                 /* JPY: ¥1,234 */
/^\d+\.\d{2}$/                          /* Decimal: 1234.56 */

/* Investment/Trading */
/^[A-Z]{1,5}$/                          /* Stock Symbol: AAPL */
/^[A-Z]{6}\d{2}[A-Z]\d{8}$/             /* CUSIP */
/^[A-Z0-9]{12}$/                        /* ISIN */

Medical and Healthcare

sas
/* Medical Record Numbers */
/^MR\d{6,8}$/                           /* MR1234567 */
/^\d{7,10}$/                            /* 1234567890 */
/^[A-Z]{2}\d{6}$/                       /* AB123456 */

/* Drug Codes and Identifiers */
/^\d{4}-\d{4}-\d{2}$/                   /* NDC: 1234-5678-90 */
/^\d{5}-\d{3}-\d{2}$/                   /* NDC: 12345-678-90 */
/^\d{5}-\d{4}-\d{1}$/                   /* NDC: 12345-6789-0 */
/^[A-Z]\d{2}(?:\.\d{1,2})?$/            /* ICD-10: A12.34 */
/^\d{3}(?:\.\d{2})?$/                   /* ICD-9: 123.45 */
/^\d{5}$/                               /* CPT: 12345 */

/* Lab Values and Measurements */
/^[<>=≤≥]\s?\d+\.?\d*$/                 /* <5.0, >=10.2 */
/^\d+\.?\d*\s?-\s?\d+\.?\d*$/           /* Range: 5.0-10.5 */
/^(?:NEG|POS|NEGATIVE|POSITIVE)$/i      /* Qualitative results */
/^(?:NORMAL|ABNORMAL|HIGH|LOW)$/i       /* Reference indicators */

/* Dosage and Administration */
/^\d+\.?\d*\s?(mg|mcg|g|ml|units?|IU)$/i /* 500mg, 10.5mcg */
/^(?:QD|BID|TID|QID|Q\d+H)$/i           /* Frequency: BID, Q8H */
/^(?:PO|IV|IM|SC|SL|PR|TOP)$/i          /* Route: PO, IV */

Date and Time Patterns

sas
/* ISO 8601 Formats */
/^\d{4}-\d{2}-\d{2}$/                   /* YYYY-MM-DD */
/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}$/ /* YYYY-MM-DDTHH:MM:SS */
/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/ /* UTC */
/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}[+-]\d{2}:\d{2}$/ /* Timezone */

/* US Date Formats */
/^\d{1,2}\/\d{1,2}\/\d{4}$/             /* M/D/YYYY or MM/DD/YY */
/^\d{1,2}-\d{1,2}-\d{4}$/               /* M-D-YYYY */
/^\d{2}\/\d{2}\/\d{2}$/                 /* MM/DD/YY */

/* European Date Formats */
/^\d{1,2}\/\d{1,2}\/\d{4}$/             /* D/M/YYYY */
/^\d{2}\.\d{2}\.\d{4}$/                 /* DD.MM.YYYY (German) */

/* Time Formats */
/^\d{1,2}:\d{2}$/                       /* H:MM or HH:MM */
/^\d{1,2}:\d{2}:\d{2}$/                 /* H:MM:SS */
/^\d{1,2}:\d{2}\s?(?:AM|PM)$/i          /* 3:45 PM */
/^(?:[01]\d|2[0-3]):\d{2}:\d{2}$/       /* 24-hour: 23:59:59 */

/* Relative Time */
/^\d+\s?(seconds?|minutes?|hours?|days?|weeks?|months?|years?)\s?ago$/i
/^in\s?\d+\s?(seconds?|minutes?|hours?|days?|weeks?|months?|years?)$/i
/^(?:today|yesterday|tomorrow)$/i

Technical and System Patterns

sas
/* IP Addresses */
/^(?:[0-9]{1,3}\.){3}[0-9]{1,3}$/       /* IPv4: 192.168.1.1 */
/^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$/ /* Valid IPv4 */
/^(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}$/ /* IPv6 full */
/^([0-9a-fA-F]{1,4}:){1,7}:$/           /* IPv6 compressed */

/* MAC Addresses */
/^[0-9A-Fa-f]{2}:[0-9A-Fa-f]{2}:[0-9A-Fa-f]{2}:[0-9A-Fa-f]{2}:[0-9A-Fa-f]{2}:[0-9A-Fa-f]{2}$/
/^[0-9A-Fa-f]{2}-[0-9A-Fa-f]{2}-[0-9A-Fa-f]{2}-[0-9A-Fa-f]{2}-[0-9A-Fa-f]{2}-[0-9A-Fa-f]{2}$/

/* URLs and Domains */
/^https?:\/\/[\w.-]+\.\w{2,}(?:\/[\w.-]*)*\/?$/  /* Basic URL */
/^https?:\/\/(?:www\.)?[\w.-]+\.\w{2,}(?:\/[\w./?#-]*)?$/  /* With www optional */
/^ftp:\/\/[\w.-]+\.\w{2,}(?:\/[\w.-]*)*\/?$/     /* FTP */
/^[\w.-]+\.\w{2,}$/                     /* Domain only */

/* File Paths */
/^[A-Za-z]:\\(?:[^\\/:*?"<>|]+\\)*[^\\/:*?"<>|]*$/ /* Windows: C:\path\file.txt */
/^\/(?:[^\/]+\/)*[^\/]*$/               /* Unix/Linux: /path/file.txt */
/^~\/(?:[^\/]+\/)*[^\/]*$/              /* Home directory: ~/file.txt */

/* Database Identifiers */
/^[a-zA-Z_][a-zA-Z0-9_]*$/              /* Valid SQL identifier */
/^`[^`]+`$/                             /* MySQL quoted identifier */
/^\[[^\]]+\]$/                          /* SQL Server bracketed */

Document and File Patterns

sas
/* File Extensions */
/\.(?:txt|csv|xlsx?|docx?|pdf|pptx?)$/i /* Office documents */
/\.(?:jpg|jpeg|png|gif|bmp|svg)$/i      /* Images */
/\.(?:mp4|avi|mov|wmv|flv|mkv)$/i       /* Videos */
/\.(?:mp3|wav|flac|aac|ogg)$/i          /* Audio */
/\.(?:zip|rar|7z|tar|gz)$/i             /* Archives */

/* Version Numbers */
/^v?\d+\.\d+\.\d+$/                     /* Semantic: v1.2.3 */
/^v?\d+\.\d+\.\d+\.\d+$/                /* Four part: 1.2.3.4 */
/^v?\d+\.\d+\.\d+-(?:alpha|beta|rc)\d*$/i /* Pre-release */

/* License Plates (US) */
/^[A-Z0-9]{2,8}$/                       /* General format */
/^[A-Z]{3}\s?\d{3}$/                    /* ABC 123 */
/^\d{3}\s?[A-Z]{3}$/                    /* 123 ABC */
/^[A-Z]{2}\s?\d{4}$/                    /* AB 1234 */

Business and Industry

sas
/* SKU/Product Codes */
/^[A-Z0-9]{6,12}$/                      /* Alphanumeric SKU */
/^[A-Z]{2,4}-\d{4,8}$/                  /* ABC-12345 */
/^\d{12}$/                              /* UPC-A */
/^\d{8}$/                               /* UPC-E */
/^\d{13}$/                              /* EAN-13 */

/* Invoice/Order Numbers */
/^INV-\d{6,8}$/                         /* INV-123456 */
/^PO-\d{4}-\d{4}$/                      /* PO-2023-1234 */
/^ORD\d{8}$/                            /* ORD12345678 */

/* Company Identifiers */
/^\d{2}-\d{7}$/                         /* EIN: 12-3456789 */
/^[A-Z]{1,4}\d{4,6}$/                   /* Stock ticker with number */
/^D-U-N-S\s?\d{9}$/                     /* DUNS number */

Scientific and Research

sas
/* Chemical Formulas */
/^[A-Z][a-z]?\d*(?:[A-Z][a-z]?\d*)*$/   /* H2O, C6H12O6 */
/^[A-Z][a-z]?(?:\d+)?(?:\([A-Z][a-z]?(?:\d+)?\)\d*)*$/  /* Ca(OH)2 */

/* Gene/Protein Names */
/^[A-Z]{3,5}\d*$/                       /* BRCA1, TP53 */
/^[A-Z][a-z]{2,10}$/                    /* Insulin, Myosin */

/* PubMed IDs */
/^\d{8}$/                               /* 12345678 */
/^PMID:\s?\d{8}$/                       /* PMID: 12345678 */

/* DOI */
/^10\.\d{4,}\/[-._;()\/:a-zA-Z0-9]+$/   /* 10.1000/xyz123 */
/^doi:10\.\d{4,}\/[-._;()\/:a-zA-Z0-9]+$/i  /* doi:10.1000/xyz123 */

/* Coordinates */
/^-?\d{1,3}\.\d+,\s?-?\d{1,3}\.\d+$/    /* Lat,Long: 40.7128,-74.0060 */
/^-?(?:180(?:\.0+)?|(?:1[0-7]\d|\d{1,2})(?:\.\d+)?)$/  /* Valid longitude */
/^-?(?:90(?:\.0+)?|[1-8]?\d(?:\.\d+)?)$/  /* Valid latitude */

Log File and System Patterns

sas
/* Log Levels */
/^\[?(DEBUG|INFO|WARN|ERROR|FATAL)\]?/i  /* [INFO] message */
/^\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2}\.\d{3}/  /* Timestamp with ms */

/* HTTP Status Codes */
/^[1-5]\d{2}$/                          /* 200, 404, 500 */
/^2\d{2}$/                              /* Success 2xx */
/^4\d{2}$/                              /* Client error 4xx */
/^5\d{2}$/                              /* Server error 5xx */

/* Server Response Times */
/^\d+(?:\.\d+)?ms$/                     /* 123.45ms */
/^\d+(?:\.\d+)?s$/                      /* 1.23s */

/* Memory/Storage Sizes */
/^\d+(?:\.\d+)?\s?(?:B|KB|MB|GB|TB|PB)$/i  /* 1.5 GB */
/^\d+(?:\.\d+)?\s?(?:bytes?|kilo|mega|giga|tera)$/i

Social Media and Modern Web

sas
/* Social Media Handles */
/^@[A-Za-z0-9_]{1,15}$/                 /* Twitter: @username */
/^#[A-Za-z0-9_]+$/                      /* Hashtag: #hashtag */
/^\/u\/[A-Za-z0-9_-]{3,20}$/            /* Reddit: /u/username */

/* Modern Web Identifiers */
/^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i  /* UUID */
/^[A-Za-z0-9_-]{22}$/                   /* YouTube video ID */
/^[A-Za-z0-9_-]{11}$/                   /* Short YouTube ID */

/* API Keys */
/^[A-Za-z0-9]{32}$/                     /* 32 char API key */
/^[A-Za-z0-9]{40}$/                     /* 40 char API key */
/^[A-Za-z0-9_-]{43}$/                   /* JWT-like format */

Validation and Data Quality

sas
/* Empty/Whitespace */
/^\s*$/                                 /* Empty or whitespace only */
/^\s+/                                  /* Leading whitespace */
/\s+$/                                  /* Trailing whitespace */
/\s{2,}/                                /* Multiple consecutive spaces */

/* Special Characters */
/[^\x00-\x7F]/                          /* Non-ASCII characters */
/[^\w\s]/                               /* Non-alphanumeric, non-space */
/[\x00-\x1F\x7F]/                       /* Control characters */

/* Data Type Validation */
/^-?\d+$/                               /* Integers */
/^-?\d*\.?\d+$/                         /* Numbers (int or decimal) */
/^[+-]?\d*\.?\d+(?:[eE][+-]?\d+)?$/     /* Scientific notation */
/^(?:true|false)$/i                     /* Boolean */
/^(?:yes|no|y|n)$/i                     /* Yes/No variants */

/* Length Validation */
/^.{8,}$/                               /* Minimum 8 characters */
/^.{0,255}$/                            /* Maximum 255 characters */
/^.{8,128}$/                            /* Between 8-128 characters */

Password and Security

sas
/* Password Strength */
/^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)[A-Za-z\d@$!%*?&]{8,}$/  /* Strong password */
/^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&])[A-Za-z\d@$!%*?&]{12,}$/  /* Very strong */
/^[A-Za-z\d@$!%*?&]{6,20}$/             /* Basic password */

/* Security Tokens */
/^[A-Za-z0-9+\/]{64}=?=?$/              /* Base64 token */
/^[0-9a-f]{64}$/                        /* SHA-256 hash */
/^[0-9a-f]{32}$/                        /* MD5 hash */

Industry-Specific Patterns

Aviation

sas
/^[A-Z]{2}\d{3,4}$/                     /* Flight number: AA1234 */
/^[A-Z]{3}$/                            /* Airport code: JFK */
/^N\d{1,5}[A-Z]{0,2}$/                  /* Aircraft tail: N12345A */

Healthcare Additional

sas
/^[A-Z]\d{2}\.[A-Z0-9]{1,4}$/           /* ICD-10-CM: A12.123 */
/^\d{5}-\d{4}-\d{2}$/                   /* HCPCS: 12345-1234-12 */
/^[A-Z]{2}\d{8}$/                       /* DEA number: AB1234567 */

Legal

sas
/^\d{2}-cv-\d{5}$/                      /* Court case: 21-cv-12345 */
/^USC\s?\d{2}\s?§\s?\d+$/               /* US Code: USC 42 § 1983 */
/^CFR\s?\d{2}\s?\d+\.\d+$/              /* Code of Federal Regulations */

Education

sas
/^\d{4}-\d{4}$/                         /* Academic year: 2023-2024 */
/^[A-Z]{2,4}\s?\d{3}$/                  /* Course code: MATH 101 */
/^\d{9}$/                               /* Student ID: 123456789 */

Key SAS Functions Summary

  • PRXPARSE() - Compile regex pattern
  • PRXMATCH() - Find pattern position
  • PRXCHANGE() - Replace patterns
  • PRXPOSN() - Extract matched groups
  • PRXPAREN() - Number of capture groups
  • PRXNEXT() - Find next match
Content is user-generated and unverified.
    SAS Regex Comprehensive Course | Claude