2179 lines
95 KiB
TeX
2179 lines
95 KiB
TeX
|
|
|
|
%%% OUTLINE
|
|
|
|
|
|
|
|
|
|
%\documentclass[twocolumn]{article}
|
|
\documentclass{article}
|
|
%\documentclass[twocolumn,10pt]{report}
|
|
\usepackage{graphicx}
|
|
\usepackage{fancyhdr}
|
|
%\usepackage{wassysym}
|
|
\usepackage{tikz}
|
|
\usepackage{amsfonts,amsmath,amsthm}
|
|
\usetikzlibrary{shapes.gates.logic.US,trees,positioning,arrows}
|
|
%\input{../style}
|
|
\usepackage{ifthen}
|
|
\usepackage{lastpage}
|
|
\usetikzlibrary{shapes,snakes}
|
|
\newcommand{\tickYES}{\checkmark}
|
|
\newcommand{\fc}{fault~scenario}
|
|
\newcommand{\fcs}{fault~scenarios}
|
|
\date{}
|
|
%\renewcommand{\encodingdefault}{T1}
|
|
%\renewcommand{\rmdefault}{tnr}
|
|
%\newboolean{paper}
|
|
%\setboolean{paper}{true} % boolvar=true or false
|
|
\newcommand{\derivec}{{D}}
|
|
|
|
%\newcommand{\fti}{{ \ensuremath{4\mA \; \rightarrow \; 20mA} }}
|
|
\newcommand{\fti}{4mA~to~20mA}
|
|
\newcommand{\ftt}{FTTI}
|
|
|
|
\newcommand{\permil}{\ensuremath{{ }^0/_{00}}}
|
|
\newcommand{\oc}{\ensuremath{^{o}{C}}}
|
|
\newcommand{\adctw}{{${\mathcal{ADC}}_{12}$}}
|
|
\newcommand{\adcten}{{${\mathcal{ADC}}_{10}$}}
|
|
\newcommand{\ohms}[1]{\ensuremath{#1\Omega}}
|
|
\newcommand{\fm}{failure~mode}
|
|
\newcommand{\fms}{failure~modes}
|
|
\newcommand{\fg}{functional~grouping}
|
|
\newcommand{\FG}{\mathcal{G}}
|
|
\newcommand{\DC}{\mathcal{DC}}
|
|
\newcommand{\fgs}{functional~groupings}
|
|
\newcommand{\dc}{derived~component}
|
|
\newcommand{\dcs}{derived~components}
|
|
\newcommand{\bc}{base~component}
|
|
\newcommand{\FMMD}{ModularFMEA}
|
|
\newcommand{\bcs}{base~components}
|
|
\newcommand{\irl}{in real life}
|
|
\newcommand{\enc}{\ensuremath{\stackrel{enc}{\longrightarrow}}}
|
|
\newcommand{\pin}{\ensuremath{\stackrel{pi}{\longleftrightarrow}}}
|
|
%\newcommand{\pic}{\em pure~intersection~chain}
|
|
\newcommand{\pic}{\em pair-wise~intersection~chain}
|
|
\newcommand{\wrt}{\em with~respect~to}
|
|
\newcommand{\abslevel}{\ensuremath{\Psi}}
|
|
\newcommand{\fmmdgloss}{\glossary{name={FMMD},description={Failure Mode Modular De-Composition, a bottom-up methodology for incrementally building failure mode models, using a procedure taking functional groups of components and creating derived components representing them, and in turn using the derived components to create higher level functional groups, and so on, that are used to build a failure mode model of a system}}}
|
|
\newcommand{\fmodegloss}{\glossary{name={failure mode},description={The way in which a failure occurs. A component or sub-system may fail in a number of ways, and each of these is a
|
|
failure mode of the component or sub-system}}}
|
|
\newcommand{\fmeagloss}{\glossary{name={FMEA}, description={Failure Mode and Effects Analysis (FMEA) is a process where each potential failure mode within a system is analysed to determine system level failure modes, and to then classify them {\wrt} perceived severity}}}
|
|
\newcommand{\frategloss}{\glossary{name={failure rate}, description={The number of failures within a population (of size N), divided by N over a given time interval}}}
|
|
\newcommand{\pecgloss}{\glossary{name={PEC},description={A Programmable Electronic Controller will typically consist of sensors and actuators interfaced electronically, with some firmware/software component in overall control}}}
|
|
\newcommand{\bcfm}{base~component~failure~mode}
|
|
\newcommand{\cf}[1]{{\tiny \textbf{#1()}}}
|
|
\newcommand{\swhw}{software~hardware}
|
|
\newcommand{\sw}{software}
|
|
\newcommand{\hw}{hardware}
|
|
\newcommand{\uP}{micro~processor}
|
|
\usepackage{color, colortbl}
|
|
\definecolor{Gray}{gray}{0.9}
|
|
\definecolor{LightGray}{gray}{0.97}
|
|
\definecolor{LightCyan}{rgb}{0.88,1,1}
|
|
\definecolor{Blue}{rgb}{0.0,0.0,0.7}
|
|
\definecolor{Red}{rgb}{0.9,0.0,0.0}
|
|
\definecolor{Green}{rgb}{0.0,0.5,0.0}
|
|
|
|
\def\layersep{1.8cm}
|
|
|
|
\newboolean{pld}
|
|
\setboolean{pld}{false} % boolvar=true or false : draw analysis using propositional logic diagrams
|
|
|
|
\newboolean{dag}
|
|
\setboolean{dag}{true} % boolvar=true or false : draw analysis using directed acyclic graphs
|
|
|
|
% \setlength{\topmargin}{0in}
|
|
% \setlength{\headheight}{0in}
|
|
% \setlength{\headsep}{0in}
|
|
% \setlength{\textheight}{22cm}
|
|
% \setlength{\textwidth}{18cm}
|
|
% %\setlength{\textheight}{24.35cm}
|
|
% %\setlength{\textwidth}{20cm}
|
|
% \setlength{\oddsidemargin}{0in}
|
|
% \setlength{\evensidemargin}{0in}
|
|
% \setlength{\parindent}{0.0in}
|
|
% %\setlength{\parskip}{6pt}
|
|
% % \setlength{\parskip}{1cm plus4mm minus3mm}
|
|
% \setlength{\parskip}{0pt}
|
|
% \setlength{\parsep}{0pt}
|
|
% \setlength{\headsep}{0pt}
|
|
% \setlength{\topskip}{0pt}
|
|
% \setlength{\topmargin}{0pt}
|
|
% \setlength{\topsep}{0pt}
|
|
% \setlength{\partopsep}{0pt}
|
|
% \setlength{\itemsep}{1pt}
|
|
% \renewcommand\subsection{\@startsection
|
|
% {subsection}{2}{0mm}%
|
|
% {-\baslineskip}
|
|
% {0.5\baselineskip}
|
|
% {\normalfont\normalsize\itshape}}%
|
|
\linespread{1.0}
|
|
|
|
\begin{document}
|
|
%\pagestyle{fancy}
|
|
%\fancyhf{}
|
|
%\fancyhead[LO]{}
|
|
%\fancyhead[RE]{\leftmark}
|
|
|
|
%\cfoot{Page \thepage\ of \pageref{LastPage}}
|
|
%\rfoot{\today}
|
|
%\lhead{Developing a rigorous bottom-up modular static failure mode modelling methodology}
|
|
%\lhead{Developing a rigorous bottom-up modular static failure modelling methodology}
|
|
% numbers at outer edges
|
|
\pagenumbering{arabic} % Arabic page numbers hereafter
|
|
\author{R.~Clark$^\star$, A.~Fish$^\dagger$, C.~Garrett$^\dagger$, J.~Howse$^\dagger$ \\
|
|
$^\star${\em Energy Technology Control, UK. r.clark@energytechnologycontrol.com} \and $^\dagger${\em University of Brighton, UK}
|
|
}
|
|
|
|
%\title{Developing a rigorous bottom-up modular static failure mode modelling methodology}
|
|
\title{Failure Mode Effects Analysis (FMEA) for Software/Hardware Hybrid Systems using a modular bottom-up hierarchical modelling methodology}
|
|
%\nodate
|
|
\maketitle
|
|
|
|
\today
|
|
|
|
\paragraph{Keywords:} static failure mode modelling; safety-critical; software FMEA
|
|
%\small
|
|
|
|
\abstract{ % \em
|
|
%\input{abs}%
|
|
The intention of this paper is to demonstrate an FMEA methodology that can
|
|
analyse integrated hardware/software systems and has test efficiency benefits.
|
|
%
|
|
The certification process of safety critical products for European and
|
|
other international standards often demands environmental stress,
|
|
endurance and Electro Magnetic Compatibility (EMC) testing. Theoretical, or `static testing',
|
|
is often also required.
|
|
|
|
Failure Mode Effects Analysis (FMEA)~\cite{iec60812}, is a bottom-up static testing technique that aims to assess the effect of all
|
|
component failure modes on a system.
|
|
%
|
|
It is used both as a design tool (to determine weaknesses), and as a requirement of certification of safety critical products.
|
|
FMEA has been successfully applied to mechanical, electrical and hybrid electro-mechanical systems.
|
|
|
|
|
|
This paper discusses the benefits and drawbacks of current
|
|
FMEA techniques and then proposes a modular FMEA methodology,
|
|
Failure Mode Modular De-Composition (FMMD)~\cite{clark}
|
|
that has the advantages of modularity, traceable failure modes throughout the model
|
|
hierarchy, an increase in test efficiency
|
|
and has
|
|
the ability to model integrated hardware and software systems.
|
|
|
|
% Work on software FMEA (SFMEA) is beginning, but
|
|
% at present no technique for SFMEA that
|
|
% integrates hardware and software models % known to the authors
|
|
% exists.
|
|
% %
|
|
%
|
|
% %
|
|
% %Failure modes in components in say a sensor, could be traced
|
|
% %up through the electronics and then through the controlling software.
|
|
% %
|
|
% %Presently Failure Mode Effects Analysis (FMEA), stops at the glass ceiling of the computer program.
|
|
% This paper takes, from the literature, new and emerging methodologies
|
|
% for software FMEA, applies them to a simple example system, and then
|
|
% reaches conclusions about the effectiveness and failure mode
|
|
% coverage of the combined FMEA techniques.
|
|
|
|
To demonstrate FMMD a small, but complete embedded system
|
|
(including both software and hardware)
|
|
is analysed for failure mode effects.
|
|
%, the industry standard
|
|
%{\ft} signalling loop.
|
|
%
|
|
} % abstract
|
|
|
|
|
|
|
|
\section{Introduction}
|
|
|
|
FMEA stands for Failure Mode Effects Analysis.
|
|
%
|
|
All components used to build a system can fail; also
|
|
they may fail in more than one way.
|
|
The ways in which a component can fail are known as its {\fms}.
|
|
|
|
At its simplest FMEA means taking a {\fm} of a component and predicting
|
|
what problems it may cause for the system it is part of.
|
|
%
|
|
One way in which an electronic component, the resistor for instance, can fail is if it were
|
|
to go open circuit.
|
|
%
|
|
This open circuit could be because it was not soldered on properly and fell off,
|
|
it could have had an internal mechanical fault or it could have been destroyed/burnt~off by too much
|
|
electrical current.
|
|
%
|
|
The cause does not matter.
|
|
%
|
|
The fact that it can fail by going open circuit does.
|
|
%
|
|
This then is one of the {\fms} of a resistor, $OPEN$.
|
|
%
|
|
For instance, an FMEA scenario could be a resistor in a system going $OPEN$. % circuit.
|
|
%
|
|
The investigator examines the electrical circuit with the resistor
|
|
in it, and using logic and reasoning, works out how the circuit would react
|
|
with that resistor failing $OPEN$.
|
|
|
|
%
|
|
If the resistor was part of an amplifier in the circuit
|
|
it could be predicted say, that a particular reading,
|
|
as measured by the amplifier's output, would go outside of an expected
|
|
range.
|
|
%
|
|
The erroneous reading may cause the system to fail dangerously or may simply be detected and flagged
|
|
as a fault.
|
|
%
|
|
The description of the outcome is at the discretion of the Engineer
|
|
responsible for the FMEA report.
|
|
|
|
|
|
The central concept of FMEA is that if all component failures are known,
|
|
by analysing them the failure behaviour of a system can be determined.
|
|
%
|
|
This means looking at every component in the system, and for each of those components
|
|
examining all known failure modes in the context of the system that it is part of.
|
|
%
|
|
Various handbooks and international standards list common components and
|
|
their known failure modes often with accompanying statistics~\cite{en298, fmd91, mil1991}.
|
|
|
|
\subsection{Origins of FMEA techniques}
|
|
%FMEA methodologies trace from the 1940's and were designed to
|
|
%model simple electro-mechanical systems.
|
|
%
|
|
The origins of
|
|
FMEA methodologies were formed %originally designed
|
|
in the 1940's to
|
|
model failure effects in simple electro-mechanical systems.
|
|
%
|
|
Because those early systems were relatively simple,
|
|
%modern FMEA methodologies follow this paradigm and
|
|
they traced component failure modes directly to system level failures.
|
|
There were no concepts of modularity and, at that time, no need to include
|
|
software elements.
|
|
%
|
|
%This paper explores the historical reasons why FMEA is performed in the way it is currently and
|
|
%the new factors placing higher demands upon it.
|
|
%
|
|
A control system designed up to the mid 1960's would typically have had no
|
|
programmatic/software elements.
|
|
|
|
|
|
Software generally sits on top of most modern safety critical control systems
|
|
and defines their most important system-wide behaviour and communications.
|
|
%
|
|
A typical control system, be it in a car or a microwave oven in the kitchen
|
|
will generally combine a micro-controller with electronics.
|
|
It will form a hierarchy where low level electronics
|
|
is implemented at the bottom, which prepares input/output (IO)
|
|
signals for/from the micro controller.
|
|
The micro controller will have software to read/send signals to the electronics
|
|
and on top of that a functional software layer where the control algorithms will
|
|
reside.
|
|
%
|
|
Typically at the top of this hierarchy are the \cf{main} and \cf{monitor} functions.
|
|
This hierarchy is represented in figure~\ref{fig:sw_hw_hierarchy}.
|
|
|
|
\begin{figure}[h]
|
|
\centering
|
|
\includegraphics[width=300pt]{./sw_hw_hierarchy.jpg}
|
|
% sw_hw_hierarchy.jpg: 814x412 pixel, 72dpi, 28.72x14.53 cm, bb=0 0 814 412
|
|
\caption{Software and hardware hierarchy of a typical modern embedded control system.}
|
|
\label{fig:sw_hw_hierarchy}
|
|
\end{figure}
|
|
|
|
|
|
|
|
%
|
|
Currently, standards that demand hardware FMEA (HFMEA) investigations (e.g.\ EN298, EN61508)
|
|
do not specify FMEA for software but instead essentially just specify good practice,
|
|
i.e. review processes and language feature constraints.
|
|
%
|
|
That is to say FMEA has no formal framework for following
|
|
failure modes from low level hardware elements through into software models~\cite{sfmeainterface}.
|
|
%
|
|
This is a weakness.
|
|
%
|
|
Where HFMEA % scientifically
|
|
traces component {\fms}
|
|
to resultant system failures, the issue of software has, until recently, been ignored.
|
|
Most %left in a non-analytical limbo
|
|
standards that mention software do not have methodologies
|
|
to apply FMEA; instead they prescribe best practices,
|
|
defensive programming strategies, redundancy and constraints~\cite{en61508}.
|
|
%
|
|
Software FMEA (SFMEA) has been proposed
|
|
in several forms~\cite{modelsfmea,sfmea,procsfmeadb,sfmeaauto}.
|
|
%
|
|
|
|
Some work has looked at the software/hardware interface~\cite{sfmeainterface}
|
|
but in general SFMEA is always performed separately from HFMEA.
|
|
%
|
|
What this means is that the FMEA analysis cannot guarantee to handle
|
|
all possible failures from hardware.
|
|
%
|
|
The hardware may even flag that it has self-detected some kind of failure, but
|
|
because the software and hardware analyses are separate
|
|
there is no way the analysis process can guarantee the hardware error will be made known to the software.
|
|
%
|
|
This means that some hardware failure modes are unexpected and therefore
|
|
un-handled by the software.
|
|
This means the system can exhibit unpredictable and possibly dangerous behaviour which
|
|
will not be picked up by the FMEA process.
|
|
%
|
|
%
|
|
% This paper seeks to examine the effectiveness of current and proposed SFMEA
|
|
% techniques, by analysing a simple hybrid hardware/software system,
|
|
% which is in common use and has mature field experience. %
|
|
% %analysing the chosen example, which is well known and understood
|
|
% %
|
|
% Because the chosen example is well understood it is
|
|
% %, this example is
|
|
% useful
|
|
% to compare the results from these FMEA methodologies with
|
|
% the known failure mode behaviour.
|
|
% %from years of field experience, and determining how well the HFMEA and SFMEA
|
|
% %analysis reports model the failure mode behaviour.
|
|
% % %
|
|
%If software and hardware integrated FMEA were possible, electro-mechanical-software hybrids could
|
|
%be modelled, and so we could consider `complete' failure mode models.
|
|
%
|
|
%Presently FMEA, stops at the glass ceiling of the computer program: FMMD seeks to address
|
|
%this, and offers additional test efficiency benefits.
|
|
This paper is a condensed version of the PhD thesis entitled `Failure Mode Modular De-Composition'~\cite{clark}. % \today
|
|
|
|
|
|
|
|
%\today
|
|
\nocite{en298}
|
|
\nocite{en61508}
|
|
|
|
|
|
|
|
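%\section{Introduction} % duplicate heading: the Introduction section is already opened earlier and this one has no live content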
|
|
{
|
|
%This paper describes a modular FMEA process that can be applied to software.
|
|
%This modular variant of FMEA is called Failure Mode Modular de-composition (FMMD).
|
|
%
|
|
%Because this process is based on failure modes of components,
|
|
%it can be applied to electrical and/or mechanical systems.
|
|
%
|
|
%The hierarchical structure of software is then examined,
|
|
%and definitions from contract programming are used
|
|
%to define failure modes and failure symptoms for
|
|
%software functions.
|
|
%
|
|
%With these definitions we can apply the FMMD modular form of FMEA
|
|
%to existing software\footnote{Existing software excluding recursive~\cite{misra}[16.2] code,
|
|
%and unstructured non-functional language.}.
|
|
}
|
|
|
|
\section{FMEA Background}
|
|
|
|
%What FMEA is, briefly variants...
|
|
|
|
FMEA~\cite{iec60812} is the process of taking
|
|
component failure modes, %and by reasoning,
|
|
tracing their effects through a system
|
|
and determining what system level failure modes could be caused.
|
|
%
|
|
%The certification process of safety critical products for European and
|
|
%other international standards often demand environmental stress, magnetic susceptibility,
|
|
%endurance and Electro Magnetic Compatibility (EMC) testing.
|
|
%
|
|
% Theoretical, or `static~testing', is often also required.
|
|
% %
|
|
% FMEA is a tool used for static testing.
|
|
% %
|
|
% For many types of safety critical system in the European Union, product design testing and FMEA
|
|
% is legally mandatory~\cite{en230,en298}.
|
|
% %
|
|
% %Its use is traditionally only applied to hardware (electrical and mechanical) systems.
|
|
% %
|
|
% %
|
|
% FMEA has its roots in the previous century where simple electro-mechanical systems were the norm.
|
|
% %
|
|
|
|
To perform FMEA, the effects of a component failure mode are examined
|
|
with respect to other components in the system; and from this behaviour
|
|
a system level failure or effect is determined.
|
|
%
|
|
That is, an experienced engineer will take a parts list, look up all the failure modes
|
|
for the parts and typically list them on a spreadsheet with one failure mode per row.
|
|
%
|
|
The Engineer will then, using available component data, circuit schematics and perhaps
|
|
experimental data, determine the effect on the system for each failure mode.
|
|
%
|
|
There will typically be attributes for each failure mode's outcome.
|
|
%
|
|
These could include whether the failure is detectable, time to repair and the
|
|
perceived severity of the outcome.
|
|
%
|
|
This report is the end product of an FMEA investigation.
|
|
%
|
|
|
|
|
|
Several variants of FMEA exist,
|
|
but the three in main use are:
|
|
|
|
\begin{itemize}
|
|
|
|
\item Design FMEA (DFMEA) is FMEA applied at the design or approvals stage~\cite{en298, en230}
|
|
where the aim is to ensure that single component failures (at least) cannot
|
|
cause unacceptable system level events~\cite{iec60812,boffin},
|
|
|
|
\item Failure Mode Effect Criticality Analysis (FMECA) is applied to determine the most potentially dangerous or damaging
|
|
failure modes using FMEA in conjunction with severity and failure probability figures~\cite{fmeca,mil1991,fmd91},
|
|
|
|
\item Failure Mode Effects and Diagnostic Analysis (FMEDA) is FMEA performed to
|
|
determine a statistical level of safety. This is a fairly standard FMEA but with statistical values attached to each component {\fm};
|
|
this is associated with the European standard EN61508~\cite{en61508} and is commonly termed Safety Integrity Level (SIL)~\cite{en61511} classification.
|
|
|
|
\end{itemize}
|
|
|
|
|
|
\subsection{Concept of `reasoning~distance'.}
|
|
\label{reasoningdistance}
|
|
%\fmmdglossRD
|
|
|
|
In order to evaluate the efficiency or performance of a process it is
|
|
often useful to count the number of operations required to perform it.
|
|
In computer science sorting algorithms are often classified by the order of operations using big `O' notation.
|
|
A bubble sort, for instance, has a worst-case sorting time of $O(N^2)$, where $N$ is the number of elements to be sorted.
|
|
Because of this a programmer is very unlikely to implement a simple bubble sort
|
|
if the number of elements $N$ is predicted to be large.
|
|
|
|
To evaluate FMEA techniques a metric is required.
|
|
%
|
|
When analysing a failure mode of a component, it is reasonable to
|
|
look at how the failure mode will affect the other components in the system and to put this then
|
|
into the context of the system's behaviour.
|
|
%
|
|
Components may fail in several ways. European standard EN298~\cite{en298} gives two possible
|
|
failure modes for a resistor as $OPEN$ and $SHORT$ for instance.
|
|
%
|
|
The term $f$ is defined as the number of component failure modes for a given component.
|
|
%A system will have $N$ number of components.
|
|
|
|
In the case of the resistor $f$ is two
|
|
~\footnote{A resistor is assigned two failure modes by the European Burner standard EN298~\cite{en298}
|
|
as long as some specific safety precautions involving voltage and power ratings are kept.} : $OPEN$ or $SHORT$.
|
|
$N$ is the number of components in the system.
|
|
In order to check this single resistor's failure modes, then,
|
|
it must be checked twice, for the condition $OPEN$ and then for the condition $SHORT$, potentially
|
|
against all other components in the system $(N-1)$.
|
|
|
|
For each component, then, there will be $f \times (N-1)$ checks to make against the other components that could be affected by
|
|
it failing.
|
|
%
|
|
By counting the number of checks to make, i.e.\ each failure mode against all other components in the system,
|
|
a metric for evaluating the maximum number of checks that need to be performed for an FMEA is defined.
|
|
|
|
|
|
%
|
|
This count of checks is defined as `reasoning~distance'---or, in other words, the number of stages of logic and reasoning used
|
|
in {\fm} analysis to map a failure cause to its potential outcome;
|
|
counted by the number of %{\fm} to
|
|
other components in the system.
|
|
%analysis stages made.
|
|
%
|
|
%The basic FMEA example in section~\ref{basicfmea}
|
|
%considered one {\fm} against some of the components in the milli-volt reader.
|
|
%
|
|
To create an exhaustive FMEA report every
|
|
known failure mode of every component
|
|
within the system would have to be examined against all its other components.
|
|
% %
|
|
% `Reasoning~distance', for one {\fm}, is defined as the number of components checked against it
|
|
% to determine its system level symptom(s).
|
|
%
|
|
No current FMEA variant gives guidelines for the components that should
|
|
be included to analyse a {\fm} in a system.
|
|
%
|
|
Were each {\fm} examined against all the other components in a system
|
|
this would be a maximum reasoning distance --- for that particular {\fm}.
|
|
%
|
|
This is termed an exhaustive FMEA case (XFMEA). % case for a single {\fm}.
|
|
%does not
|
|
% The exhaustive~reasoning~distance would be
|
|
% the sum of the number of failure modes, against all other components
|
|
% in that system.
|
|
Thus the exhaustive~reasoning~distance for a particular component
|
|
would be to multiply
|
|
the number of failure modes it has by the number of remaining components
|
|
in the system.
|
|
%
|
|
The exhaustive reasoning~distance for a system would be the
|
|
sum of these multiplications for all its components. % it contains.
|
|
%
|
|
Take a hypothetical small system with say 100 components, with three failure modes per component,
|
|
%this
|
|
%would give an exhaustive reasoning distance for single failure analysis---of $3 \times 100 \times 99$.
|
|
that means a check for each {\fm} of every component, i.e.\ $3$ checks, would have to be made
|
|
against 99 other components. There are 100 components in this hypothetical example, so
|
|
for single failure analysis this means $3 \times 100 \times 99$ checks.
|
|
%
|
|
This concept of `reasoning~distance' provides a metric to examine
|
|
the state explosion problems associated with FMEA (and other forward search failure investigation
|
|
methodologies).
|
|
%
|
|
%\fmmdglossSTATEEX
|
|
%
|
|
A high reasoning distance, because it is a manual process performed by experts, is
|
|
expensive in terms of both time and money.
|
|
%
|
|
It is apparent also that the shorter the reasoning distance, the more precisely theoretical examination
|
|
can determine failure symptoms. A shorter reasoning distance therefore implies a higher quality of safety analysis.
|
|
%
|
|
|
|
|
|
%
|
|
%.... general concept... simple ideas about how complex a
|
|
%failure analysis is the more modules and components are involved
|
|
% cite for forward and backward search related to safety critical software
|
|
%{sfmeaforwardbackward}
|
|
%\subsection{FMEA and the State Explosion Problem}
|
|
\label{sec:xfmea}
|
|
% \paragraph{Problem of which components to check for a given {\bc} {\fm}.}
|
|
% %\fmmdglossSTATEEX
|
|
% %
|
|
% FMEA for safety critical certification (i.e. for EN298 and EN61508)~\cite{en298,en61508} has to be applied
|
|
% to all known failure modes of all components within a system.
|
|
% %
|
|
% Each one of these, in a typical report, would be one line of a spreadsheet entry.
|
|
% %
|
|
% FMEA does not define or specify the scope of the investigation for each component failure mode.
|
|
% %
|
|
% For instance should the signal path be followed, with all components encountered along that, or should the scope be wider?
|
|
% %
|
|
% %If we wethe effect of a component {\fm} against all other components
|
|
% %in a system, this could be said to be exhaustive analysis.
|
|
|
|
\paragraph{Exhaustive Single Failure FMEA Order equation.}
|
|
%\fmmdglossXFMEA
|
|
%
|
|
To perform XFMEA, every possible interaction
|
|
of a failure mode with all other components in a system would have to be examined.
|
|
%
|
|
Or in other words, all possible failure scenarios considered.
|
|
%
|
|
%to do this completely (all failure modes against all components).
|
|
This is represented in equation~\ref{eqn:fmea_single} below, %~\ref{eqn:fmea_state_exp},
|
|
where $N$ is the total number of components in the system, $RD_{single}$ is the reasoning~distance and
|
|
$f$ is the number of failure modes per component:
|
|
%
|
|
\begin{equation}
|
|
\label{eqn:fmea_single}
|
|
RD_{single} = N \times (N-1) \times f . % \\
|
|
%(N^2 - N).f
|
|
\end{equation}
|
|
%
|
|
This means an order of $O(N^2)$ checks to perform
|
|
to undertake XFMEA for single failures.
|
|
%
|
|
%Even small systems have typically
|
|
%100 components, and they typically have 3 or more failure modes each, which would give
|
|
The hypothetical example described above gives $100 \times 99 \times 3 = 29{,}700$ as a reasoning~distance.
|
|
|
|
%%% SANITY CHECK.
|
|
%%%
|
|
When stating a general equation such as equation~\ref{eqn:fmea_single}, it can be sanity checked
|
|
by thinking of common examples.
|
|
For instance a simple amplifier circuit with a handful of components
|
|
would have a low $RD_{single}$ count of potential failure mode to components checks.
|
|
%
|
|
From experience, with a simple amplifier circuit it is relatively easy to predict
|
|
how it would react to well defined component failure modes.
|
|
|
|
For a larger circuit the problems of tracing side effects of the failure mode through the circuit
|
|
mean that it is likely to be a far more complex task.
|
|
%
|
|
The order $O(N^2)$ for FMEA complexity, for single failures, therefore agrees with experience.
|
|
%
|
|
In general terms, for a very simple small circuit, a better understanding of failure effects is expected,
|
|
than for a very large system where there are more variables and potential {\fm} interactions.
|
|
%
|
|
%\fmmdglossSTATEEX
|
|
\paragraph{Exhaustive FMEA and double failure scenarios.}
|
|
%
|
|
%\paragraph{Exhaustive Double Failure FMEA}
|
|
For looking at potential double failure
|
|
scenarios\footnote{Certain double failure scenarios are already legal
|
|
requirements; the European gas burner standard (EN298:2003~\cite{en298}), for instance, demands the checking of
|
|
double failure scenarios (for burner lock-out scenarios).}
|
|
%
|
|
(two components failing within a given time frame) the order becomes $O(N^3)$.
|
|
Where $RD_{double}$ is the reasoning~distance for double failure scenarios:
|
|
\begin{equation}
|
|
\label{eqn:fmea_double}
|
|
RD_{double} = N \times (N-1) \times (N-2) \times {f}^{2} . % \\
|
|
%(N^2 - N).f
|
|
\end{equation}
|
|
%
|
|
For a theoretical system with 100 components and a fixed 3 failure modes each, this gives a reasoning distance of
|
|
$100 \times 99 \times 98 \times 9 = 8{,}731{,}800$. % failure mode scenarios.
|
|
%
|
|
In practice there is an additional complication; that of
|
|
the circuit topology changes that {\fms} can cause.
|
|
Double failure analysis is usually only performed on sections
|
|
of a system considered most critical, and often in the context of redundancy.
|
|
%
|
|
For a combustion controller, it is stated~\cite{en298} that there must be two separate
|
|
fuel shut-off valves, that are controlled from different relays and wiring.
|
|
%
|
|
This is actually more an enforcement of redundancy than FMEA for `any~double~combination' of failure modes.
|
|
|
|
\paragraph{Reliance on experts for meaningful FMEA Analysis.}
|
|
Current FMEA methodologies cannot consider---for the reason of state explosion---an exhaustive approach.
|
|
%We define exhaustive FMEA ({\XFMEA}) as examining the effect of every component failure mode
|
|
%against the remaining components in the system under investigation.
|
|
%
|
|
%\fmmdglossSTATEEX
|
|
%
|
|
%Because for practical reasons,
|
|
In practical terms XFMEA cannot be performed for anything other than a trivial system; instead
|
|
reliance is placed upon experts on the system under investigation
|
|
to perform a meaningful analysis.
|
|
%
|
|
These experts must use their judgement and experience to choose
|
|
sub-sets of the components in the system to check against each {\fm}.
|
|
%
|
|
Also, %In practise
|
|
these experts have to select the areas they see as most critical for detailed FMEA analysis:
|
|
it is usually impossible, for reasons of time to perform the work,
|
|
to action a detailed level of analysis on all component {\fms}
|
|
on anything but a very small %hypothetical
|
|
system (i.e. XFMEA).
|
|
|
|
% \subsection{Component Tolerance}
|
|
%
|
|
% Component tolerances may need considering when determining if a component has failed.
|
|
% Calculations for acceptable ranges to determine failure or acceptable conditions
|
|
% must be made where appropriate.
|
|
% %
|
|
% An example of component tolerance considered for FMEA
|
|
% is given in section~\ref{sec:resistortolerance}.
|
|
|
|
% %\section{FMEA in current usage: Five variants}
|
|
% \section{FMEA in current usage: Four variants}
|
|
%
|
|
% %\paragraph{Five main Variants of FMEA}
|
|
% \paragraph{Four main Variants of FMEA}
|
|
% \begin{itemize}
|
|
% %\item \textbf{PFMEA - Production} Emphasis on cost reduction and product improvement;
|
|
% \item \textbf{FMECA - Criticality} Emphasis on minimising the effect of critical systems failing~\cite{fmeca}; % Military/Space
|
|
% \item \textbf{FMEDA - Statistical Safety} Statistical analysis giving Safety Integrity Levels~\cite{en61508};
|
|
% \item \textbf{DFMEA - Design or Static/Theoretical} Approval of safety critical systems using FMEA and single or double failure prevention~\cite{en298};% EN298/EN230/UL1998
|
|
% \item \textbf{SFMEA - Software FMEA} --- Usage not enforced by most current standards~\cite{en298,en230,en61508}. %only used in highly critical systems at present.
|
|
% \end{itemize}
|
|
|
|
|
|
\nocite{MILSTD1629short}
|
|
|
|
\section{FMEA and modularity.}
|
|
Because modern electronics has become more complex the number
|
|
of basic components in a typical safety critical system has risen dramatically.
|
|
%
|
|
|
|
%
|
|
To add to this, components used to fulfil common functions are often Integrated Circuits (ICs).
|
|
%
|
|
Typical examples include voltage regulators, op-amps, micro-controllers~\cite{pic18f2523}, memory modules and
|
|
protocol handlers~\cite{mcp2515}. To build any of these components from scratch would be very expensive and time consuming,
|
|
but these IC `components' have very high internal transistor counts, and each have their own unique
|
|
failure mode behaviour.
|
|
%
|
|
Thus modern electronics has already become too large in scope to sensibly implement the base component failure mode directly mapped to
|
|
a system failure paradigm.
|
|
|
|
\paragraph{Modularity --- breaking large systems into manageable blocks}
|
|
|
|
When faced with complex systems, a typical way to make them
|
|
manageable is to break them into sub-systems, and even sub-systems of sub-systems ad infinitum.
|
|
|
|
|
|
|
|
\paragraph{History of Modularisation in Software}
|
|
%
|
|
It is interesting to compare the development of FMEA methodologies with software.
|
|
%
|
|
Software faced a crisis in complexity in the 1960's where the architecture of
|
|
the dominant computer language FORTRAN~\cite{f77} became a limiting factor.
|
|
%
|
|
Programs written in FORTRAN became clumsy when they became large.
|
|
%
|
|
All variables were global.
|
|
%
|
|
A misspelled variable could cause chaos.
|
|
%
|
|
Also it was often difficult to pull a function
|
|
out of one program and place it in another if it used some of the global variables.
|
|
|
|
Newer computer languages were invented where modularity was encouraged.
|
|
Instead of FORTRAN's global scope for variables, individual functions in newer languages like `C'
|
|
started to have `local' variables. This meant that
|
|
a programmer could take a function from a `C' program and
|
|
use it in another one without complication.
|
|
%
|
|
Later languages implemented object orientation
|
|
which grouped functions and data together into modules called classes, where
|
|
even the internal local variables of a class could be hidden from the
|
|
programmer using the class.
|
|
%
|
|
For instance the internal workings of a binary~tree or linked~list do not need to be
|
|
accessed if one simply wants to use a class to store data: in this case
|
|
the programmer would pick a ready written and well de-bugged data storage class and simply use it.
|
|
%
|
|
Software expanded in complexity faster than electronics,
|
|
and to cope with this software languages developed modularity
|
|
(function call trees, classes and finally distributed processing mechanisms).
|
|
%
|
|
FMEA has, by necessity, started to include some modular features but none yet
|
|
have defined mechanisms for ensuring that all component failure modes
|
|
are traceable from component to system level. % in the analysis of the module(s) that incorporate it.
|
|
%
|
|
That is to say the process is not rigorous.
|
|
|
|
|
|
|
|
\paragraph{Modularisation in safety analysis in the automotive industry.}
|
|
|
|
The automotive industry, because of mass production, must make products that have high safety integrity %that are very safe but
|
|
% financial pressure keeps their products
|
|
but must also be affordable.
|
|
%
|
|
This leads to specialist firms producing modules, such as automatic braking systems,
|
|
that are bought in and assembled % better word then assembled???? included???
|
|
to make an automobile.
|
|
%
|
|
Performing failure analysis using the basic component single failure modes to
|
|
system failure mapping would thus be very difficult: this would require expert knowledge
|
|
of the design behaviour and component types used in each module.
|
|
%%
|
|
%Because modern systems have become more complex and now include software elements,
|
|
%modularity
|
|
%of some form (breaking the problem down into smaller sections),
|
|
%has become necessary to break down the state explosion problems associated with FMEA.
|
|
%
|
|
Some modular FMEA techniques are starting to be used and specified, and are described below.
|
|
|
|
\paragraph{Automotive SIL (ASIL) --- modularisation of FMEDA.}
|
|
%
|
|
The EN61508 variant for automotive use, as defined in standard ISO~26262, is known as Automotive SIL (ASIL)~\cite{Kafka20122}.
|
|
%
|
|
Because of the modular approach forced on automotive designers
|
|
a process has been developed called `ASIL~de-composition'~\cite{6464473}.
|
|
%
|
|
This allows automotive designers to use pre-certified modules in their designs
|
|
and applies broad statistical guidelines to achieving particular safety levels by
|
|
use of redundancy and automated diagnostics etc.
|
|
%
|
|
Note that the ASIL modules are given a reliability rating which can be enhanced with redundancy.
|
|
It does not introduce traceable {\fm} reasoning in its hierarchy.
|
|
%%
|
|
%% IN SOFTWARE THIS WOULD BE TIGHTLY COUPLED AS OPPOSED TO LOOSELY COUPLED FUNCTIONS.
|
|
|
|
%
|
|
\paragraph{Indenture levels --- modularisation of FMECA.}
|
|
%
|
|
The US military standard for FMECA~\cite{fmeca} describes a very broad modularity regime that
|
|
it terms `indenture' levels.
|
|
%
|
|
Indenture levels are arranged from the top down
|
|
and identify finer and finer grained modules.
|
|
%
|
|
For instance, an aircraft
|
|
may be the first indenture level, and the next may be an identifiable module such as
|
|
an altitude radar: within that finer grained modules may be identified until
|
|
the base components are listed.
|
|
%
|
|
Note that this is a top down approach to modularisation and
|
|
this can introduce errors into the reliability calculations
|
|
by missing out some component failure modes~\cite{MILSTD1629short}.
|
|
%
|
|
|
|
\paragraph{Integrated Circuits (ICs).}
|
|
|
|
Consider some commonly used ICs: an op-amp
|
|
is a good example.
|
|
%
|
|
An op-amp will have a high internal component count.
|
|
It is mainly a collection of transistors on a chip
|
|
and is a complex circuit designed to give a very high and precise differential gain.
|
|
%These are made from several components including
|
|
%ransistos, resistors capactors etc.
|
|
In order to perform FMEA op-amps are given
|
|
failure modes in the literature~\cite{fmd91, mil1991}
|
|
as though they are simple base components.
|
|
%
|
|
This is a form of modularisation.
|
|
%
|
|
%It is assumed that with experience and analysis
|
|
%the op-amp failure modes were compiled.
|
|
%
|
|
This has effectively become a precedent %starting point
|
|
for modularisation of FMEA. % by stealth!
|
|
%
|
|
%It also sets .
|
|
If it is acceptable to model a complex IC as a component, assigning it a set of failure modes,
|
|
it should be possible to analyse sections of a larger circuit
|
|
and treat those sections as components in their own right.
|
|
|
|
%
|
|
% \paragraph{Top Down or Bottom-up?}
|
|
% % Because FMEA is a bottom up technique, applying a top down analysis (as in FMECAs indenture levels)
|
|
% % cannot guarantee to consider all component failure modes in the correct context.
|
|
% % %
|
|
% A top down approach (such as FTA) can miss~\cite{faa}[Ch.~9] individual failure modes of components,
|
|
% especially where there are non-obvious or unexpected top-level failures.
|
|
% %
|
|
% In order to ensure that every failure mode is considered, a bottom-up approach
|
|
% including every base components {\fms} must be used.
|
|
% %
|
|
% Going back to the software analogy, the indenture levels of FMECA are similar to
|
|
% a software call tree where the highest indenture levels would be leaf functions.
|
|
% %
|
|
% There is no equivalent of the software `class'.
|
|
% %
|
|
% In the real world however there are.
|
|
% Off the shelf sensors can be purchased which communicate using standard protocols~\cite{Pfeiffer:2003:ENC:1199616}. % consider CANOpen standard sensors, these are%~\footnote{CANopen sensors...}
|
|
% %modules connected by an industrial data bus.
|
|
% %
|
|
% These not only typically have electrical and mechanical
|
|
% components, they have a firmware and communication bus aspects~\cite{canspec, caninauto}.
|
|
% %
|
|
% These type of modules combine hardware, electronics, software, communications
|
|
% and distributed programming.
|
|
% %
|
|
% Current FMEA techniques struggle with software alone, and also, fail to integrate the analysis of hardware and software
|
|
% systems~\cite{sfmea, embedsfmea, modelsfmea, sfmeaa}. %, sfmeainterface }.
|
|
%
|
|
|
|
%
|
|
|
|
%
|
|
|
|
|
|
%
|
|
%Software FMEA techniques have been proposed
|
|
|
|
|
|
%FMMD is a modularisation of FMEA and can produce failure~mode models that can be used in
|
|
%all the above variants of FMEA.
|
|
\subsection{The problem of Systems using software and FMEA}
|
|
|
|
Software systems are becoming part of everyday life.
|
|
It is getting increasingly rare to find systems where there is not a computer
|
|
controlling some part of it.
|
|
All modern airliners are fly-by-wire. The throttle in a modern car is fly-by-wire.
|
|
|
|
|
|
Because software and hardware FMEAs are separate, tracing failure effects
|
|
from hardware into software, or even ensuring that all predicted
|
|
hardware failure modes have been handled in software is difficult.
|
|
%
|
|
This problem is recognised and work has been undertaken to
|
|
begin to redress this problem.
|
|
|
|
|
|
\paragraph{Current work on Software FMEA.}
|
|
|
|
SFMEA usually does not seek to integrate
|
|
hardware and software models, but to perform
|
|
FMEA on the software in isolation~\cite{procsfmea}.
|
|
%
|
|
Work has been performed using databases
|
|
to track the relationships between variables
|
|
and system failure modes~\cite{procsfmeadb}, to %work has been performed to
|
|
introduce automation into the FMEA process~\cite{appswfmea} and to provide code analysis
|
|
automation~\cite{modelsfmea}.
|
|
%
|
|
Although the SFMEA and hardware FMEAs are performed separately,
|
|
some schools of thought aim for Fault Tree Analysis (FTA)~\cite{nasafta,nucfta} (top-down, deductive)
|
|
and FMEA (bottom-up inductive)
|
|
to be performed on the same system to provide insight into the
|
|
software/hardware interface~\cite{embedsfmea}.
|
|
%
|
|
Although this
|
|
would give a better picture of the failure mode behaviour, it
|
|
is by no means a rigorous approach to tracing errors that may occur in hardware
|
|
through to the top (and therefore ultimately controlling) layer of software.
|
|
|
|
\subsection{Current FMEA techniques are not suitable for software}
|
|
|
|
The main FMEA methodologies are all based on the concept of taking
|
|
base component {\fms}, and translating them into system level events/failures~\cite{sfmea,sfmeaa}.
|
|
%
|
|
That is, having only one stage of reasoning between the low level component {\fm} and
|
|
the system level symptom of failure leaves ample room for error.
|
|
%
|
|
In a complicated system, mapping a component failure mode to a system level failure
|
|
will mean a long reasoning distance; that is to say the actions of the
|
|
failed component will have to be traced through
|
|
several sub-systems, gauging its effects with and on other components.
|
|
%
|
|
With software at the higher levels of these sub-systems,
|
|
this introduces another layer of complication.
|
|
%
|
|
%In order to integrate software, %in a meaningful way
|
|
%we need to re-think the
|
|
%FMEA concept of simply mapping a base component failure to a system level event.
|
|
%
|
|
% SFMEA regards, in place of hardware components, the variables used by the programs to be their equivalent~\cite{procsfmea}.
|
|
% The failure modes of these variables, are that they could become erroneously over-written,
|
|
% calculated incorrectly (due to a mistake by the programmer, or a fault in the micro-processor on which it is running), or
|
|
% external influences such as
|
|
% ionising radiation causing bits to be erroneously altered.
|
|
However, it is desirable to trace failure mode effects through the hardware and software interfaces.
|
|
This is for two reasons.
|
|
%
|
|
The first is to ensure that the software can detect all possible
|
|
hardware failures, and secondly that the software actually reacts appropriately.
|
|
|
|
|
|
|
|
|
|
\section{FMEA deficiencies and `wishlist'}
|
|
|
|
%\subsection{FMEA - General Criticism}
|
|
A summary of deficiencies in current FMEA methodologies is listed below:
|
|
\begin{itemize}
|
|
%\item FMEA type methodologies were designed for simple electro-mechanical systems of the 1940's to 1960's,
|
|
\item State explosion - %impossible
|
|
very difficult/time consuming to perform FMEA exhaustively, %rigorously
|
|
\item Difficult to re-use previous analysis work~\cite{rudov2009language},
|
|
\item Very difficult to model simultaneous/multiple failures,
|
|
\item Software and hardware models are treated separately (if the software is modelled at all) meaning the software interface may not be correctly modelled,
|
|
%\item reasoning distance -- component failure to system level symptom process is undefined in regard
|
|
%to the components to check against each given component {\fm},
|
|
\item FMEA methodologies are undefined in regard to scope, i.e. which components to check against given failure modes,
|
|
%
|
|
\item Distributed real time systems are very difficult to analyse with FMEA because they typically involve many hardware/software interfaces.
|
|
\end{itemize}
|
|
|
|
Traditional forms of FMEA are no longer % fit for purpose!
|
|
of meaningful use for complex modern systems especially those incorporating programmatic elements.
|
|
%
|
|
They were designed to analyse simple electro-mechanical systems
|
|
and even commonplace high component count analogue circuits (that are usually surface mount and therefore physically small) are
|
|
getting too complicated for meaningful analysis using FMEA.
|
|
%
|
|
% With surface mount technology and increasingly dense integrated circuitry, electronics generally
|
|
% has much higher component counts and more complex components than those in use when FMEA
|
|
% was designed.
|
|
%
|
|
From the above deficiencies, a wish list for a better FMEA is presented, stating the features that should exist
|
|
in an improved FMEA methodology:
|
|
\begin{itemize}
|
|
\item Must be able to analyse hybrid software/hardware systems,
|
|
\item avoid state explosion (i.e. XFMEA is impractical by hand~\cite{cbds}),
|
|
\item encourage exhaustive checking within each module, %(total failure coverage within {\fgs} all interacting component and failure modes checked),
|
|
\item traceable reasoning inherent in system failure models,% to aid repeatability and checking,
|
|
\item re-usable i.e. it should be possible to re-use analysis,
|
|
\item possibility to analyse simultaneous/multiple failures,
|
|
%\item one to one mapping from {\bc} {\fms} to system level failures (see section~\ref{sec:onetoone}),
|
|
\item able to model a system built with bought in sub-systems --- i.e. usable in a distributed system.
|
|
% \item
|
|
\end{itemize}
|
|
|
|
|
|
\section{Proposed Methodology: Failure Mode Modular De-composition (FMMD)}
|
|
|
|
The basic concept behind FMMD is to, from the bottom-up, modularise the problem.
|
|
|
|
FMEA cannot easily be modularised from the top-down, because
|
|
it has to deal with component failure modes.
|
|
%
|
|
It may seem a bit counter-intuitive, but this means that if FMEA is to be modularised
|
|
it must be done from the bottom up.
|
|
This may seem like a strange idea, but consider how an engineer would look
|
|
at an electronic circuit/schematic.
|
|
%
|
|
The Engineer might, for instance, trace an input signal
|
|
into some other components following a connection on the schematic.
|
|
%
|
|
The Engineer would typically then, following signal paths, try to figure out what
|
|
those components did.
|
|
%
|
|
For instance were it an amplifier, the engineer would
|
|
recognise the electronic configuration,
|
|
and maybe get a calculator out and determine its gain
|
|
or some other property, by looking at the other components connected to it.
|
|
%
|
|
This is a form of modularisation from the bottom-up.
|
|
%
|
|
The Engineer has identified a module, an input amplifier.
|
|
By identifying other modules at component level
|
|
in the circuit, these modules can then be merged to form
|
|
bigger modules until there is a hierarchy and one final module representing the whole system.
|
|
|
|
|
|
\paragraph{Broadly FMMD is modularisation from the bottom-up of FMEA.}
|
|
|
|
Firstly modules are identified (for instance common circuitry formations such as amplifiers or digital outputs) and
|
|
then failure mode analysis is performed on them.
|
|
%
|
|
By analysing a small group of components as a module
|
|
the ways in which the module can fail can be listed.
|
|
%
|
|
This gives a set of symptoms of failure for the module.
|
|
%
|
|
This in effect allows the module to be treated as a component, or {\dc}.
|
|
%
|
|
When the lower levels have been analysed, modules can be brought
|
|
together to form larger modules using the lower ones as though they were
|
|
components.
|
|
%
|
|
These modules can be brought together to form even larger modules.
|
|
|
|
Eventually there is one large module which represents the entire system.
|
|
|
|
Because the terms module and sub-system are quite general terms, and possibly over-used,
|
|
a new term has been used to take their place in FMMD.
|
|
%
|
|
This is the `functional~group'.
|
|
%
|
|
Quite simply when identifying a group of components that perform a particular task
|
|
the term `functional~group' describes it as a group that performs a function.
|
|
%
|
|
It also means that a functional~group can contain other functional~groups without
|
|
dragging along the semantic baggage that comes with the terms `module' and `sub-system'.
|
|
|
|
|
|
\section{The proposed Methodology: description}
|
|
\label{fmmdproc}
|
|
%
|
|
%% One line
|
|
The basic concept of FMMD is to modularise FMEA from the bottom-up: by choosing groups of components that
|
|
work together to perform a given function: the failure modes of the components
|
|
are analysed, and a failure mode behaviour for the group determined: this group
|
|
can now be used as a component in its own right with a set of its own failure modes.
|
|
%
|
|
In essence, this methodology begins with low level modules (or {\fgs})
|
|
which are analysed and assigned a failure mode behaviour.
|
|
They are then considered as higher level components with
|
|
their own failure mode behaviour. These higher level components
|
|
are then collected to form {\fgs} and so on until a hierarchy is built
|
|
representing the entire system.
|
|
%
|
|
This means that failure modes can be traced through linking the
|
|
{\fgs}. This means that the system level {\fms} can be traced back to
|
|
the component {\fms} that can cause them.
|
|
%
|
|
This gives rigorous failure mode traceability through the model.
|
|
|
|
|
|
%
|
|
Any new static failure mode methodology must ensure that it
|
|
represents all component failure modes and it therefore should be bottom-up,
|
|
starting with individual component failure modes.
|
|
%
|
|
That way, all component failure modes must be considered.
|
|
%
|
|
If you modularise from the top down, it does not naturally follow that
|
|
bottom-level component failure modes would be handled/used.
|
|
%
|
|
Starting at the bottom means having to deal with each component failure mode from the beginning.
|
|
|
|
\subsection{The proposed Methodology: quick guide or `how~to'.}
|
|
|
|
An FMEA typically begins with a parts list and then from that a series
|
|
of entries for each component failure mode.
|
|
Often these will be listed in the order they are found
|
|
on the parts list.
|
|
|
|
With FMMD a different approach is taken.
|
|
The Engineer will examine the circuit schematic and look for {\fgs}.
|
|
That is small collections of components that work together
|
|
to perform a function.
|
|
Once the circuit has been analysed so that all components
|
|
have been collected into a {\fg} the first stage of analysis begins.
|
|
%
|
|
Each {\fg} is analysed and its symptoms of failure are listed.
|
|
It is then treated as a {\dc} and given a name.
|
|
%
|
|
Where there are repeated sections of circuitry these
|
|
could share the same name but take an index number (for instance
|
|
were a circuit to contain several {\fti} inputs
|
|
they could be named $\text{\ftt}_1$, $\text{\ftt}_2$ etc.).
|
|
This also means the analysis of the {\fti} circuit {\fg} need only be performed once.
|
|
%
|
|
This gives the first stage of {\dcs}.
|
|
|
|
These {\dcs} are now treated as components and used to form {\fgs}.
|
|
Eventually a hierarchy will be built until the whole
|
|
system is included. The top level failure symptoms are the ways in which the system can fail.
|
|
|
|
An advantage of this is that all component failure modes must be considered
|
|
in terms of their effects as the system goes from the
|
|
lower levels through to more abstract system level failures.
|
|
%
|
|
This can lead to surprises.
|
|
|
|
%
|
|
Often when a system is evaluated
|
|
by FMMD a list of system level failures can include ones
|
|
that are not currently dealt with or even detectable
|
|
without some re-design.
|
|
%
|
|
Having surprises at the design stage
|
|
and not during beta~test (or even in~the~field) is a very good thing
|
|
when dealing with safety critical systems!
|
|
|
|
|
|
Because the ways in which a software function can fail can be listed
|
|
it too can be treated as an FMMD {\fg}.
|
|
%Software functions are treated as components as well, and
|
|
%treat the hardware they interface to (if any) as components.
|
|
A software function's `components' are the software functions it calls
|
|
and the hardware elements it interfaces to (if any).
|
|
Eventually
|
|
all software hierarchies must reach down to hardware in order to react with the real world.
|
|
|
|
An example of a hardware low level analysis is given in~\cite{syssafe2011} and a combined
|
|
software/hardware sub-system in~\cite{syssafe2012}. Examples of both, including analysis of performance
|
|
can be found in~\cite{clark}.
|
|
|
|
FMMD is described in more detail in the section below.
|
|
|
|
\subsection{FMMD process detailed description}
|
|
|
|
To ensure all component failure modes are modelled and traceable through stages of analysis, the new methodology must be bottom-up.
|
|
%
|
|
%This seems essential to satisfy criterion 2.
|
|
%The proposed methodology is therefore a bottom-up process
|
|
A {\em {\fg}}, is defined as a small collection of components
|
|
that interact to provide
|
|
a function or task within a system.
|
|
%
|
|
Starting with {\bcs} small {\fgs} are chosen and each component failure mode considered in the
|
|
context of the {\fg}.
|
|
%
|
|
%% GARK
|
|
%
|
|
The component failures are termed {\em{\fcs}}. %`test~cases'.
|
|
For each {\fc}
|
|
there will be a corresponding resultant failure, or `symptom', from the perspective of the {\fg}.
|
|
%
|
|
% MAYBE NEED TO DESCRIBE WHAT A SYMPTOM IS HERE
|
|
%
|
|
%From the perspective of the {\fg} failures of components will be symptoms.
|
|
It is conjectured that many symptoms will be common. That is to say
|
|
that component failures will often cause the same symptoms of failure
|
|
from the perspective of a {\fg}.
|
|
%
|
|
%
|
|
A common symptom collection stage is now applied. Here common symptoms are collected
|
|
from the results of the {\fcs}.
|
|
%Because it is possible to model combinations of failures, criterion 6 is satisfied.
|
|
%
|
|
With a collection of the {\fg} failure symptoms, a new component, a {\em{\dc}} is created.
|
|
The failure modes of this new {\dc} are the symptoms of the {\fg} it was derived from.
|
|
%This satisfies criterion 4, as we can now treat {\dcs} as pre-analysed
|
|
%modules available for re-use.
|
|
%
|
|
%
|
|
By using {\dcs} in higher level functional groups, a hierarchy can be built representing
|
|
the failure mode behaviour of a system. Because the hierarchy maintains information
|
|
linking the symptoms to component failure modes (via {\fcs}), reasoning connections from base component failures to top level failures can now be made
|
|
by tracing cause and effect through the hierarchy of modules\footnote{This means that an FMMD model can be used to produce traditional FMEA reports where each {\bc} {\fm} is linked to
|
|
a system level failure.}.
|
|
%The traceability should satisfy criterion 5.
|
|
An advantage of performing FMEA in this modular way is that the
|
|
{\fgs} are small in terms of the numbers of components. This means the $O(N^2)$ effect
|
|
of the reasoning distance is greatly reduced for the overall project.
|
|
This addresses the state explosion problem of XFMEA.
|
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%FFT%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
\footnote{In the field of digital signal processing there is an algorithm that revolutionised
|
|
access to frequency analysis of digital samples called the Fast Fourier Transform (FFT)~\cite{fftoriginal}.
|
|
This took the Discrete Fourier Transform (DFT), and applied de-composition to its
|
|
mesh of (often repeated) complex number calculations~\cite[Ch.~8]{fpodsadsp}.}
|
|
%
|
|
By doing this it breaks the computing order of complexity down from having a polynomial %n exponential
|
|
%order
|
|
to logarithmic order~\cite[pp.~401--3]{ctw}.
|
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%FFT%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
It also means that {\fgs} are re-usable (analogous to software classes).
|
|
%
|
|
Where there are repeated sections of circuitry (as in for instance common types of interface)
|
|
the analysis for that module may be simply re-used.
|
|
%
|
|
%
|
|
A practical example of a hardware FMEA performed both traditionally and using FMMD may be found in~\cite{syssafe2011}, a hybrid
|
|
software and hardware example is analysed in~\cite{syssafe2012}
|
|
and examples of `reasoning~distance' efficiency savings can be found in~\cite[Ch.~7]{clark}.
|
|
%
|
|
\subsection{Integrating software into the FMMD model.}
|
|
%
|
|
%With modular FMEA i.e. FMMD %(FMMD)
|
|
%the concepts of failure~modes
|
|
%of components, {\fgs} and symptoms of failure have been defined. % for a functional group.
|
|
%
|
|
A programmatic function has similar attributes to an FMMD {\fg}. % with these concepts. %a {\fg} as defined by the FMMD process.
|
|
%
|
|
An FMMD {\fg} is placed into a hierarchy, likewise
|
|
a software function is typically placed into the hierarchy of its call-tree.
|
|
%
|
|
A software function calls other functions and uses data sources %via hardware interaction
|
|
which could be viewed as its `components':
|
|
it has outputs, i.e. it can perform actions on data or hardware.
|
|
%which will be used by other functions that may call it.
|
|
%
|
|
It is shown below that a software function can be mapped to an FMMD {\fg}: its failure modes
|
|
are the failure modes of the software components %(other functions
|
|
it calls %)
|
|
and/or the hardware from which it reads values.
|
|
Its outputs are the data it changes, or the hardware actions it performs.
|
|
%%
|
|
%% Talk about how software specification will often say how hardware
|
|
%% will react and how to interpret readings---but they do not
|
|
%% always cover the failure modes of the hardware being interfaced too.
|
|
%
|
|
When a
|
|
software function has been analysed---using failure conditions of its inputs as a source of failure modes---its symptoms of failure
|
|
can be defined (i.e. how functions that call it will see its failure mode behaviour).
|
|
%
|
|
%
|
|
FMMD is applied to software functions by viewing functions in terms of their failure mode behaviour.
|
|
%
|
|
That is to say, using FMMD, software functions are treated like {\fgs} of electronic components.
|
|
%
|
|
%
|
|
As software already fits into a hierarchy, there is one less analysis decision to make when compared
|
|
to analysing electronics.
|
|
%
|
|
For electrical and mechanical systems, although the original system designers'
|
|
concepts of modularity and sub-systems in design may provide guidance,
|
|
applying FMMD means deciding on the members for {\fgs} and the subsequent hierarchy.
|
|
|
|
\paragraph{Contract Programming and FMMD.}
|
|
%
|
|
With electronic components, the literature points to suitable sets of
|
|
{\fms}~\cite{fmd91,mil1991,en298}. %~\cite{en61508}~\cite{en298}.
|
|
%
|
|
With software only some library functions are well known and rigorously documented
|
|
enough to have the equivalent of known failure modes;
|
|
most software is `bespoke'.
|
|
%
|
|
A different strategy is required to
|
|
describe the failure mode behaviour of software functions; %.
|
|
concepts from contract programming can be used to assist in this. % here.
|
|
|
|
\subsection{Contract programming description}
|
|
%\fmmdglossCONTRACTPROG
|
|
Contract programming~\cite{dbcbe} is a discipline for building software functions in a controlled
|
|
and traceable way. Each function is subject to pre-conditions (constraints on its inputs),
|
|
post-conditions (constraints on its outputs) and function wide invariants (rules).
|
|
|
|
|
|
\paragraph{Mapping contract `pre-condition' violations to component failure modes.}
|
|
%\fmmdglossCONTRACTPROG
|
|
A precondition, or requirement for a contract software function
|
|
defines the correct ranges of input conditions for the function
|
|
to operate successfully.
|
|
%
|
|
% C Garret said this was unclear so I have added the following two sentences.
|
|
%
|
|
%If we consider a software function to be a {\fg} in the FMMD sense, i.e.
|
|
A software function is considered to be
|
|
a collection of code, functions called and %values/
|
|
variables used.
|
|
%
|
|
In this way it is similar to an electronic circuit, which is a collection
|
|
of components connected in a specific way.
|
|
%
|
|
Using this analogy for software, the connections are the function's code, and the
|
|
called functions/variables/inputs %and variables
|
|
are the components.
|
|
%
|
|
Erroneous behaviour from called functions and variables/inputs has the same effect as component failure modes
|
|
on an electronic {\fg}.
|
|
%
|
|
%
|
|
If it is considered that %consider the
|
|
called functions and variables/inputs are the components of a function,
|
|
a modular and hierarchical failure mode model
|
|
from existing software can be built.
|
|
%
|
|
Thus for FMMD applied to software, a violation of a pre-condition is considered to be equivalent to a failure mode of `one of its components'.
|
|
%
|
|
\paragraph{Mapping contract `post-condition' violations to symptoms.}
|
|
%\fmmdglossCONTRACTPROG
|
|
%
|
|
A post-condition is a definition of correct behaviour of a function.
|
|
%
|
|
A violated post-condition is a symptom of failure, or, in FMMD terms, a derived failure mode, for a function.
|
|
%
|
|
Post-conditions could relate to either actions performed (i.e. the state of hardware changed) or an output value of a function.
|
|
%
|
|
In pure contract programming, a violation of a pre-condition would cause the function to \textbf{not} be executed.
|
|
%
|
|
In implementation code, a pre-condition violation should cause
|
|
an error to be generated, and thus a post-condition to fail.
|
|
%
|
|
A function can fail for reasons other than corruption of its input data (i.e.
|
|
failure caused by variables it uses or return values from functions it calls).
|
|
%
|
|
Variables can become corrupted, by radiation affecting RAM~\cite{5488118,5963919} or
|
|
by another software function erroneously overwriting variables~\cite{swseatbelt}.
|
|
%
|
|
Current work on software FMEA generally focuses on mapping
|
|
variable corruption to failure modes~\cite{procsfmea,procsfmeadb,sfmeaauto,sfmea}.
|
|
However, errors other than variable corruption can occur.
|
|
%
|
|
For instance a microprocessor may have subtle bugs in its instruction set, or
|
|
incorrectly handled
|
|
interrupt contention~\cite{concurrency_c_tool} which could cause side effects in software.
|
|
%
|
|
For the failure mode model of any software function,
|
|
it must be considered that all failure modes defined by post-condition
|
|
violations could simply occur.
|
|
%`components'.
|
|
%
|
|
\paragraph{Mapping contract `invariant' violations to symptoms and failure modes.}
|
|
Invariants are conditions that are considered to be relied on throughout the execution of
|
|
a program.
|
|
%
|
|
Here they are taken to mean invariants applying to data
|
|
or conditions that the function under analysis deals with or could be affected by.
|
|
%
|
|
Invariants in contract programming may apply to inputs to the function (where violations can be considered {\fms} in FMMD terminology),
|
|
and to outputs (where violations can be considered symptoms, or derived {\fms}, in FMMD terminology).
|
|
%\fmmdglossCONTRACTPROG
|
|
|
|
|
|
|
|
|
|
%
|
|
\section{Example for analysis} % : How can we apply FMEA}
|
|
% %
|
|
The example chosen is the smallest meaningful embedded system that the author is familiar with:
|
|
a standalone temperature controller.
|
|
This consists of both hardware and software.
|
|
The hardware elements are sensors (temperature sensing Pt100 resistors),
|
|
an actuator (an output to switch on and off a heater)
|
|
and some indicator LEDs.
|
|
|
|
The software reads the temperature from the sensor and applies checks
|
|
to detect any failures.
|
|
The software then applies a PID~\cite{dcods} algorithm to determine the length/modulation of the pulses applied to the heater.
|
|
|
|
%yourdon context diagram here
|
|
|
|
|
|
|
|
|
|
\subsection{Closed Loop Control Hardware/Software Hybrid Example}
|
|
|
|
%It is desirable to model a complete standalone system with FMMD,
|
|
%not only a standalone system, but ideally a hybrid software/hardware system.
|
|
%
|
|
Temperature control is typically a first order differential problem, and is often
|
|
addressed using the Proportional Integral Differential (PID) algorithm~\cite{dcods}[p.66].
|
|
%
|
|
Traditionally this was performed in analogue electronics
|
|
with trimmer potentiometers providing the P, I and D parameters.
|
|
%
|
|
Since the introduction of digital computers, it has been possible to
|
|
implement PID in software. %pro-grammatically.
|
|
%
|
|
A PID temperature controller is presented
|
|
as a complete example of a software/hardware hybrid analysed using FMMD. %would mean an
|
|
%analysis of a realistic standalone system without being it becoming an un-wieldingly large task.
|
|
% % \paragraph{The PID Temperature Control Algorithm.}
|
|
% % PID control starts with a setpoint, or desired value for a process
|
|
% % (here the temperature). It reads the process value and determines an error value for it.
|
|
% % The aim of the PID controller is to minimise this error term, by setting an output value,
|
|
% % which is fed back into the process (in this example the amount of power to supply the heater).
|
|
% % The error value is integrated and multiplied by an I constant.
|
|
% % A differential of the error value is calculated and multiplied by a D constant.
|
|
% % The error value itself is multiplied by a P constant, and all three of these are added
|
|
% % to obtain the output required.
|
|
% % %
|
|
% % A mathematical description of PID with frequency domain modelling (La-Place transforms etc)
|
|
% % may be found in~\cite{dcods}[Ch.3.3].
|
|
%
|
|
\subsection{Design Stage: Implementation on a micro-controller.}
|
|
%
|
|
When designing a computer program it is often useful to
|
|
start with a system overview.
|
|
A structured analysis `Yourdon' context diagram~\cite{Yourdon:1989:MSA:62004} is presented below, see figure~\ref{fig:context_diagram_PID}.
|
|
A Yourdon context diagram shows an overview of a system, with the data inputs and data outputs.
|
|
The circle in the middle defines the processing applied to those inputs and outputs.
|
|
The context diagram can be later refined by introducing more circles with data paths between them.
|
|
Finally a {\swhw} hierarchy can be derived from a Yourdon diagram, which assists
|
|
in the design of the software (specifically the structure of the call tree and the hardware/software interfaces).
|
|
|
|
%
|
|
\begin{figure}[h]
|
|
\centering
|
|
\includegraphics[width=400pt]{../../submission_thesis/CH5_Examples/context_diagram_PID.png}
|
|
% context_diagram_PID.png: 818x324 pixel, 72dpi, 28.86x11.43 cm, bb=0 0 818 324
|
|
\caption{Yourdon Context Diagram for a standalone micro-processor implemented PID Temperature Controller.}
|
|
\label{fig:context_diagram_PID}
|
|
\end{figure}
|
|
%
|
|
Using figure~\ref{fig:context_diagram_PID} the system in terms of its data flow is reviewed, starting
|
|
with the data sources (the Pt100 temperature sensor inputs) and the data sinks (the heater output and the LED indicators).
|
|
%
|
|
There are two voltage inputs (for a detailed analysis and discussion of a four wire Pt100 configuration see~\cite{clark}[5]) from the Pt100 temperature sensor.
|
|
%
|
|
For the Pt100 sensor, the voltages it outputs are read and %for
|
|
this requires an ADC and MUX.
|
|
%
|
|
%\fmmdglossADC
|
|
%
|
|
For the output, a Pulse Width Modulator (PWM)\footnote{PWM provides a means to modulate an output i.e. vary power levels.}
|
|
can be used (this is a common module found on micro-controllers
|
|
facilitating variable power output~\cite{aoe}[p.360]).
|
|
%
|
|
PWMs, ADCs and MUXs are commonly built into cheap micro-controllers~\cite{pic18f2523}[Ch.15].
|
|
%
|
|
|
|
%and add more detail, see figure~\ref{fig:context_diagram2_PID}.
|
|
|
|
\begin{figure}[h]
|
|
\centering
|
|
\includegraphics[width=400pt]{../../submission_thesis/CH5_Examples/context_diagram2_PID.png}
|
|
% context_diagram_PID.png: 818x324 pixel, 72dpi, 28.86x11.43 cm, bb=0 0 818 324
|
|
\caption{Yourdon data flow diagram for PID Temperature Controller identifying initial processing nodes.}
|
|
\label{fig:context_diagram2_PID}
|
|
\end{figure}
|
|
%
|
|
\clearpage
|
|
%
|
|
The Yourdon methodology provides model refinement, by zooming into data transform bubbles, analysing them in more
|
|
depth and creating more paths and transform bubbles which further define the data flow and processing. % required.
|
|
%
|
|
The Yourdon diagram is refined, by adding detail to both the afferent data flow coming through the MUX and ADC on the micro-controller and the efferent
|
|
data flow channelled through a PWM module. %again built into the micro-controller,
|
|
%
|
|
This next stage of model refinement is shown in figure~\ref{fig:context_diagram2_PID}.
|
|
%
|
|
This is refined by looking at or zooming into transform bubbles
|
|
and adding more detail i.e. following the data streams through the process, additional transform bubbles are created as required.
|
|
%%%%%
|
|
|
|
%%%%%
|
|
The lines connecting the `transform~bubbles' define the data passed between them.
|
|
%
|
|
When the data flow analysis is finished, each transform bubble represents a software function.
|
|
%
|
|
Because the connecting lines define the data passed between transform bubbles,
|
|
the inputs and outputs of the associated software functions are also defined.
|
|
%
|
|
The Yourdon methodology thus allows the refinement and modelling
|
|
of a process from a data~flow perspective
|
|
defining software functions in its final stage (see figure~\ref{fig:contextsoftware}).
|
|
%, and
|
|
%this in terms of software functions.
|
|
%
|
|
In all `bare~metal'\footnote{`Bare~metal' is a term used to indicate a micro-processor
|
|
controlled system that does not use a traditional operating system. These are generally
|
|
coded in `C' or assembly language and run immediately from power-up.}
|
|
software architectures, a rudimentary operating system is required, often referred to as the `monitor'.
|
|
%
|
|
The `monitor' function calls the PID function at a regular and precise interval.
|
|
%
|
|
The PID function, because the algorithm depends heavily on integral calculus~\cite{dcods}[Ch.3.3], is time sensitive
|
|
and it is necessary to execute it at precise intervals determined by its proportional, integral and differential (PID) coefficients.
|
|
%
|
|
Most micro-controllers feature several general purpose timers~\cite{pic18f2523}.
|
|
%
|
|
An internal timer can be used in conjunction with the monitor function
|
|
to call the PID algorithm at a regular and precise time interval. % specified interval.
|
|
%
|
|
\paragraph{Data flow model to programmatic call tree.}
|
|
The Yourdon methodology also gives guidance as to which software
|
|
functions should be called to control a process, or in `C' terms be the main function.
|
|
%
|
|
\begin{figure}[h]
|
|
\centering
|
|
\includegraphics[width=400pt]{../../submission_thesis/CH5_Examples/context_software.png}
|
|
% context_software.png: 1023x500 pixel, 72dpi, 36.09x17.64 cm, bb=0 0 1023 500
|
|
\caption{Final Yourdon data flow diagram which has defined the software functions for the PID temperature controller}
|
|
\label{fig:contextsoftware}
|
|
\end{figure}
|
|
%
|
|
Using figure~\ref{fig:contextsoftware} the transform bubble
|
|
to represent the `main' or controlling function in the software must be chosen.
|
|
%
|
|
All software functions will be written in bold with a pair of brackets
|
|
to distinguish them as such. The `C' main function is thus presented as \cf{main}.
|
|
%
|
|
This can be thought of as picking one bubble and holding it up.
|
|
%
|
|
The other bubbles hang underneath
|
|
forming the software call tree hierarchy, see figure~\ref{fig:context_calltree}.
|
|
%
|
|
From examining the diagram, and in common with established embedded programming practice,
|
|
this is clearly going to be the \cf{monitor} function.
|
|
%
|
|
\begin{figure}[h]
|
|
\centering
|
|
\includegraphics[width=300pt]{../../submission_thesis/CH5_Examples/context_calltree.png}
|
|
% context_calltree.png: 800x783 pixel, 72dpi, 28.22x27.62 cm, bb=0 0 800 783
|
|
\caption{Software: Yourdon data flow diagram converted to programmatic call tree.}
|
|
\label{fig:context_calltree}
|
|
\end{figure}
|
|
%
|
|
%
|
|
\paragraph{Software Algorithm.}
|
|
%
|
|
The monitor function will orchestrate the control process.
|
|
%
|
|
Firstly it will examine the timer value, and when appropriate, call the \cf{PID} function.
|
|
%
|
|
The \cf{PID} function calls \cf{determine\_set\_point\_error} which calls \cf{convert\_ADC\_to\_T}
|
|
which in turn calls \cf{Read\_ADC} (a function developed and analysed using FMMD in~\cite{syssafe2012})
|
|
which reads from hardware.
|
|
%
|
|
With the set point error value\footnote{In the field of control engineering the set point error value is often simply referred to as the `error' term.
|
|
In this context it is the difference between the target temperature and the temperature read. For instance were the target temperature
|
|
to be $200^{\circ} C$ and the temperature read to be $150^{\circ} C$ the error value would be $-50^{\circ} C$}
|
|
the \cf{PID} function will return an output control value to its calling
|
|
function (i.e. the PID demand which will be returned to the monitor function).
|
|
%
|
|
%On returning to the monitor function, it will return the PID demand value.
|
|
The PID demand value will be applied via the pulse width modulation (PWM) module.
|
|
%
|
|
%A rudimentary closed loop control system incorporating both hardware and software has been defined.
|
|
With the hardware and software elements defined, and incorporated into a hierarchy, the
|
|
overall structure of the temperature controller has now been designed.
|
|
%
|
|
By using the Yourdon methodology a programmatic design framework, i.e. a call tree structure, was obtained.
|
|
%
|
|
All the components, i.e. hardware elements and software functions
|
|
that will be used in the temperature controller are now defined.
|
|
%
|
|
As each of these elements can be analysed as components in FMMD,
|
|
analysis can begin.
|
|
%
|
|
%These are listed, and from the bottom-up, FMMD analysis is begun.
|
|
%
|
|
\clearpage
|
|
%\subsection{FMMD Analysis of PID temperature Controller}
|
|
%
|
|
|
|
%
|
|
\subsection{Temperature Controller Hardware Elements FMMD.}
|
|
%
|
|
|
|
The hardware elements of this project have been analysed using FMMD
|
|
and can be found in~\cite{clark, syssafe2011, syssafe2012}.
|
|
Being able to re-use analysis work is another advantage of using modularised FMEA.
|
|
Even complex constructs such as
|
|
circuitry connected to a {\uP} which reads voltages
|
|
into {\sw} functions can be picked up from one project and simply re-used in another.
|
|
|
|
To summarise from the design stage,
|
|
the electronic components identified thus far:
|
|
\begin{itemize}
|
|
\item ADCMUX --- Internal micro controller multiplexer and analogue to digital converter,
|
|
\item TIMER --- Internal micro controller timer,
|
|
\item HEATER --- Heating element, essentially a resistor,
|
|
\item Pt100 --- Pt100 Temperature sensor,
|
|
\item PWM --- Internal micro controller pulse width modulation module,
|
|
\item General Purpose I/O (GPIO) --- I/O used to drive LEDS, %. %source LED current
|
|
\item LEDs --- Indication LEDs via GPIO,
|
|
\item micro-controller --- the medium for running the software.
|
|
\end{itemize}
|
|
|
|
Each electronic {\dc} will be described and cited in more detail below.
|
|
|
|
\paragraph{ADCMUX and Read\_ADC.}
|
|
The {\dc} from \cite{syssafe2012} is re-used for this analysis. %section~\ref{readADC}.
|
|
This analysis was performed on a `C' function which
|
|
read a value from an analogue to digital converter (ADC) hardware element.
|
|
The analysis revealed that it could fail in three ways.
|
|
$$ fm(RADC) = \{ VV\_ERR, HIGH, LOW \} .$$
|
|
%
|
|
%
|
|
\paragraph{TIMER.}
|
|
%
|
|
The internal timer, from a programmer's perspective, is a register, which when read
|
|
returns an incremented time value.
|
|
%
|
|
Essentially it is a free running integer counter with an interfacing register.
|
|
%
|
|
Using two's complement mathematics, by subtracting
|
|
the last read time value, we can calculate the interval
|
|
between readings (assuming the timer has not wrapped around more than once).
|
|
%
|
|
A timer can fail by
|
|
incrementing its value at an incorrect rate, or can stop incrementing.
|
|
%
|
|
The failure modes of $TIMER$ are defined thus:
|
|
$$ fm(TIMER) = \{ STOPPED, INCORRECT\_INTERVAL \}.$$
|
|
%
|
|
\paragraph{HEATER.}
|
|
A heating element is typically some configuration of resistive wire.
|
|
It therefore has the same failure modes as a resistor:
|
|
$$fm(HEATER) = \{ OPEN, SHORT \} .$$
|
|
%
|
|
\paragraph{Pt100 Platinum Temperature Sensor.}
|
|
|
|
The four wire Pt100 configuration is commonly used in safety critical designs.
|
|
For single failure analysis this circuit has only one failure mode.
|
|
The Pt100 four wire configuration was analysed in~\cite{clark}[5.6], the {\dc} is re-used here:
|
|
$$ fm(Pt100) = \{ OUT\_OF\_RANGE \} . $$
|
|
%
|
|
%
|
|
\paragraph{PWM.}
|
|
%The PWM, in use, is a hardware register written to with an integer value~\cite{pic182523}[Ch.15].
|
|
From a programmatic perspective a PWM output is a register to which software writes
|
|
an unsigned magnitude value~\cite{pic18f2523}[Ch.15].
|
|
%
|
|
The PWM hardware module
|
|
applies this using a mark space ratio proportional to that value, providing
|
|
a means of varying the amount of power supplied.
|
|
%
|
|
When the PWM action is halted, or fails, the digital output pin associated with it
|
|
will typically be held in a high or low state.
|
|
%
|
|
The PWM has the following failure modes:
|
|
$$ fm(PWM) = \{ HIGH, LOW \}.$$
|
|
|
|
\paragraph{Micro-Controller.}
|
|
The Micro controller is a complex piece of highly integrated electronics.
|
|
%
|
|
At a minimum it would include a micro-processor with PROM, RAM,
|
|
general I/O and external interrupt lines.
|
|
%
|
|
Typically there are many other I/O modules incorporated (e.g. TIMERS, UARTS, PWM, ADC, ADCMUX, CAN).
|
|
%
|
|
In this project the ADCMUX, TIMER, PWM and general purpose computing facilities are used.
|
|
%
|
|
Consider the general~computing, CLOCK, PROM and RAM failure modes:
|
|
$$fm (\text{micro-controller}) =\{ PROM\_FAULT, RAM\_FAULT, CPU\_FAULT, ALU\_FAULT, CLOCK\_STOPPED \}.$$
|
|
%
|
|
\subsection{Temperature Controller Software Elements FMMD}
|
|
Identified Software Components:
|
|
\begin{itemize}
|
|
\item --- \cf{Monitor} (which calls \cf{PID},\cf{output\_control} and \cf{setLEDS}),
|
|
\item --- \cf{PID} (which calls \cf{determine\_set\_point\_error} ),
|
|
\item --- \cf{determine\_set\_point\_error} (which calls \cf{convert\_ADC\_to\_T}),
|
|
\item --- \cf{convert\_ADC\_to\_T} (which calls \cf{read\_ADC}), % which has been analysed as the {\dc} read\_ADC which can be re-used.} % from the last example),
|
|
\item --- \cf{read\_ADC} (analysed in~\cite{syssafe2012}),
|
|
\item --- \cf{output\_control} (which sets the PWM hardware according to the PID demand value).
|
|
\end{itemize}
|
|
%
|
|
%
|
|
With the call tree structure defined (see figure~\ref{fig:context_calltree}),
|
|
a hierarchy compatible with FMMD for analysis has been obtained.
|
|
%
|
|
However, it is only the top, i.e. the software, part of the hierarchy.
|
|
%
|
|
FMMD is a bottom-up process, thus it starts with the lowest level, i.e. the electronics.
|
|
%
|
|
The Yourdon context diagram (see figure~\ref{fig:context_diagram_PID}) is useful here as its data sources and sinks are
|
|
by definition the lowest levels in the system.
|
|
%
|
|
The input, or origin of the afferent data flow can be followed to find system inputs,
|
|
and the output, or efferent flow to find the bottom level for outputs/actuators etc.
|
|
%
|
|
Starting with the afferent flow, the reading of the temperature and its conversion
|
|
to a PID calculated heater output demand is examined.
|
|
%
|
|
\subsubsection{Afferent flow FMMD analysis, Pt100, temperature, set point error, PID output demand.}
|
|
%
|
|
Starting with the afferent data flow for the temperature readings, the lowest
|
|
level in the hierarchy is found, the Pt100 sensor.
|
|
%with the software, and consider the hardware elements
|
|
%used (if any) by each software function.
|
|
%Starting
|
|
|
|
|
|
|
|
|
|
%
|
|
%
|
|
%
|
|
%The {\dc} Read\_Pt100 is obtained from analysis of a {\fg} comprising of the
|
|
%\cf{Read\_ADC} software function and the Pt100 hardware.
|
|
%The {\dc} Read\_Pt100 is a failure mode model of the \cf{Read\_ADC} software function and the Pt100
|
|
%hardware, this
|
|
The {\dc} convert\_ADC\_to\_T has the following failure modes:
|
|
%
|
|
$$ fm (convert\_ADC\_to\_T) = \{ VOLTAGE\_HIGH, VAL\_ERR, VOLTAGE\_LOW \}. $$
|
|
%
|
|
%
|
|
Moving along the afferent flow, the \cf{convert\_ADC\_to\_T} function is next up the hierarchy.
|
|
%
|
|
This will call \cf{Read\_ADC} twice, once for the high Pt100 value, again for the lower. % and once for to read a current sense.
|
|
%
|
|
The resistance of the Pt100 element is then calculated, and with this---using a
|
|
polynomial or a lookup table~\cite{eurothermtables}---the temperature determined.
|
|
%
|
|
%\fmmdglossCONTRACTPROG
|
|
%
|
|
The pre-conditions for the function are that:
|
|
\begin{itemize}
|
|
% \item The current calculated is within pre-defined bounds i.e. Pt100\_current,
|
|
\item The lower Pt100 value is within an acceptable voltage range i.e. Pt100\_lower\_voltage,
|
|
\item The higher Pt100 value is within an acceptable voltage range i.e. Pt100\_higher\_voltage,
|
|
\item The lower and higher values agree to within a given tolerance i.e. Pt100\_high\_low\_mismatch.
|
|
\end{itemize}
|
|
%
|
|
Any violation of these pre-conditions is equivalent to a failure mode\footnote{An actual measured temperature outside the
|
|
pre-defined range would be detected as an unacceptable voltage range failure.}.
|
|
%
|
|
The post-condition is that it returns a temperature within a given tolerance to the temperature at the sensor.
|
|
%
|
|
A failure of this post-condition can be termed `temp\_incorrect'.
|
|
%
|
|
\clearpage
|
|
Applying FMMD to the {\fg} formed by \cf{Read\_Pt100} and the function \cf{convert\_ADC\_to\_T}
|
|
gives the {\dc} {Get\_Temperature}.
|
|
%
|
|
This analysis is presented in table~\ref{tbl:gettemperature}.
|
|
%
|
|
Beginning at the bottom, a {\fg} is formed with
|
|
the function \cf{read\_ADC} and the Pt100: this corresponds to the `C' function
|
|
\cf{convert\_ADC\_to\_T}.
|
|
When FMMD analysis has been performed this gives a {\dc}, named
|
|
`convert\_ADC\_to\_T'. % (see appendix~\ref{sec:readPt100}).
|
|
|
|
|
|
\begin{table}[h]
|
|
\centering
|
|
|
|
\caption{ convert\_ADC\_to\_T: Failure Mode Effects Analysis} % title of Table
|
|
\label{tbl:gettemperature}
|
|
|
|
\begin{tabular}{|| l | l | c | l ||} \hline
|
|
% \textbf{Failure} & \textbf{failure} & \textbf{Symptom} \\
|
|
% \textbf{Scenario} & \textbf{effect} & \textbf{RADC } \\ \hline
|
|
\hline
|
|
\rowcolor{LightCyan}
|
|
\textbf{Component} & \textbf{Failure} & \textbf{Failure } & \textbf{Symptom} \\
|
|
\rowcolor{LightCyan}
|
|
\textbf{} & \textbf{cause} & \textbf{Effect} & \\
|
|
|
|
|
|
\hline
|
|
Pt100 & FC1: out of range reading & Pt100 voltage & $VOLTAGE\_HIGH$ \\
|
|
& from the Pt100 resistors. & outside range & $VOLTAGE\_LOW$ \\ \hline
|
|
|
|
\cf{read\_ADC} & FC2: $RADC_{VV\_ERR}$ & voltage & $VAL\_ERR$ \\
|
|
& ADC failure & & \\ \hline \hline
|
|
|
|
|
|
|
|
\cf{read\_ADC} & FC3: $RADC_{HIGH}$ & voltage value & $VOLTAGE\_HIGH$ \\
|
|
& ADC reads High & & \\ \hline
|
|
|
|
|
|
|
|
\cf{read\_ADC} & FC4: $RADC_{LOW}$ & voltage value & $VOLTAGE\_LOW$ \\
|
|
& ADC reads low & from ADC value low & \\ \hline
|
|
|
|
\cf{read\_ADC} & FC5: post condition fails & software failure in & $VAL\_ERR$ \\
|
|
& in function \cf{read\_ADC} & \cf{read\_ADC} & \\ \hline
|
|
|
|
\cf{convert\_ADC\_to\_T} & FC6: post condition fails & software failure in & $VAL\_ERR$ \\
|
|
& in function \cf{convert\_ADC\_to\_T} & \cf{convert\_ADC\_to\_T} & \\ \hline
|
|
|
|
\end{tabular}
|
|
\end{table}
|
|
|
|
|
|
|
|
%\fmmdglossADC
|
|
Collecting symptoms from table~\ref{tbl:gettemperature}, the {\dc} $convert\_ADC\_to\_T$ is assigned the following failure modes:
|
|
$$
|
|
fm(convert\_ADC\_to\_T) = \{ VOLTAGE\_HIGH , VOLTAGE\_LOW, VAL\_ERR\} .
|
|
$$
|
|
%The analysis for the Pt100 circuit is presented in table~\ref{tbl:readPt100}.
|
|
%
|
|
%
|
|
%
|
|
Following the afferent flow further, the function to determine the control error value is examined.
|
|
%
|
|
This is simply the target temperature subtracted from that measured by the sensor.
|
|
%
|
|
A {\fg} is formed with the newly formed {\dc} convert\_ADC\_to\_T
|
|
and the function \cf{determine\_set\_point\_error}.
|
|
%
|
|
The pre-condition for \cf{determine\_set\_point\_error} is that the temperature read by it
|
|
is accurate, and its post-condition is to return the correct control error value.
|
|
%
|
|
%All single failure modes from a four wire Pt100 sensor are detectable (see section~\ref{sec:singlePt100FMEA}).
|
|
%
|
|
%For most practical purposes this would suffice, but for the purpose of example
|
|
%a particular double failure scenario, potentially giving an undefined value is
|
|
%considered (see section~\ref{sec:Pt100floating}).
|
|
%
|
|
The post-condition can fail, or the temperature read could be incorrect.
|
|
%
|
|
This could be detectable (i.e. the symptoms $\{ VOLTAGE\_HIGH , VOLTAGE\_LOW \} $)
|
|
or undetectable (i.e. the post condition for this function simply fails
|
|
or the failure mode $VAL\_ERR$ occurs).
|
|
%and so an incorrect value that is detected, KnownIncorrectErrorValue
|
|
%where we can detect the Pt100 value is suspect,
|
|
%and IncorrectErrorValue where there is simply
|
|
%an incorrect value but this cannot be determined (i.e. its an undetectable failure). % this.
|
|
%
|
|
This analysis is presented in table~\ref{tbl:geterror}.
|
|
%
|
|
%
|
|
%
|
|
\begin{table}[h]
|
|
\centering
|
|
\caption{ GetError: Failure Mode Effects Analysis} % title of Table
|
|
\label{tbl:geterror}
|
|
|
|
\begin{tabular}{|| l | l | c | l ||} \hline
|
|
% \textbf{Failure} & \textbf{failure} & \textbf{Symptom} \\
|
|
% \textbf{Scenario} & \textbf{effect} & \textbf{RADC } \\ \hline
|
|
\hline
|
|
\rowcolor{LightCyan}
|
|
\textbf{Component} & \textbf{Failure} & \textbf{Failure } & \textbf{Symptom} \\
|
|
\rowcolor{LightCyan}
|
|
\textbf{} & \textbf{cause} & \textbf{Effect} & \\
|
|
|
|
|
|
\hline
|
|
convert\_ADC\_to\_T & FC1: VOLTAGE\_HIGH & detectable failure & $KnownIncorrectErrorValue$ \\ \hline
|
|
convert\_ADC\_to\_T & FC2: VOLTAGE\_LOW & detectable failure & $KnownIncorrectErrorValue$ \\ \hline
|
|
convert\_ADC\_to\_T & FC3: VAL\_ERR & undetectable failure & $IncorrectErrorValue$ \\ \hline
|
|
|
|
|
|
\cf{GetError} & FC4: post condition fails & software failure in & $IncorrectErrorValue$ \\
|
|
& in function \cf{GetError} & \cf{GetError} & \\ \hline
|
|
|
|
\end{tabular}
|
|
\end{table}
|
|
%
|
|
Failure mode symptoms are collected and a new {\dc} GetError created
|
|
where:
|
|
$$fm(GetError) = \{ KnownIncorrectErrorValue, IncorrectErrorValue \}.$$
|
|
%
|
|
Following the afferent path the PID algorithm is next in the software call tree.
|
|
%
|
|
%Here we assume that the PID constants are fixed (i.e. are not parameters).
|
|
%
|
|
The $GetError$ {\dc} and the \cf{PID} function form a {\fg}.
|
|
%
|
|
The pre-condition for the \cf{PID} function is that
|
|
it receives the correct error value.
|
|
%
|
|
The post-condition is that it outputs correct control values.
|
|
% RESP FOR TIMEING IS ON CALLING FUNCTION AND IS A SEPARATE ERROR- TGHINK ABOUT JITTER.....
|
|
% and controll values..... Jitter might not matter, wrong int times would
|
|
% controlling function provdes context of use.
|
|
%Those familiar with the PID algorithm may realise that digital signal processing algorithms are sensitive to calling frequency.
|
|
All digital signal processing algorithms are sensitive to calling frequency, and thus should be time invariant~\cite{fpodsadsp}[p.58].
|
|
Were this function to be called at an incorrect rate, its output
|
|
could be erroneous (the differential and integral parameters would effectively have been changed).
|
|
%
|
|
However this problem is a failure mode for the consideration of the function calling it i.e. the context of use. %(see section~\ref{sec:subjectiveobjective}).
|
|
%
|
|
That is, the \cf{PID} function is called, but its calling function is responsible for the timing,
|
|
or in more general terms,
|
|
it is the calling function that sets the context for the \cf{PID} function (i.e. what it is used for).
|
|
%If this PID were to be used, say as some form of low pass filter, we could consider jitter
|
|
%for instance.
|
|
%
|
|
%In a control environment with PID, jitter would not be a significant factor.
|
|
%
|
|
%HARK THE HERALD ANGELS SING... HARK????
|
|
%
|
|
%
|
|
|
|
\begin{table}[h]
|
|
\centering
|
|
%
|
|
|
|
\caption{ PID: Failure Mode Effects Analysis} % title of Table
|
|
\label{tbl:pidfunction}
|
|
\begin{tabular}{|| l | l | c | l ||} \hline
|
|
% \textbf{Failure} & \textbf{failure} & \textbf{Symptom} \\
|
|
% \textbf{Scenario} & \textbf{effect} & \textbf{RADC } \\ \hline
|
|
\hline
|
|
\rowcolor{LightCyan}
|
|
\textbf{Component} & \textbf{Failure} & \textbf{Failure } & \textbf{Symptom} \\
|
|
\rowcolor{LightCyan}
|
|
\textbf{} & \textbf{cause} & \textbf{Effect} & \\
|
|
|
|
|
|
\hline
|
|
GetError & FC1: $KnownIncorrectErrorValue$ & safe heater control & KnownControlValueErrorV \\ \hline
|
|
GetError & FC2: $KnownIncorrectErrorValue$ & safe heater control & KnownControlValueErrorV \\ \hline
|
|
GetError & FC3: $IncorrectErrorValue$ & unsafe heater control & IncorrectControlErrorV \\ \hline
|
|
|
|
|
|
\cf{PID} & FC4: post condition fails & software failure in & IncorrectControlErrorV \\
|
|
& in function \cf{PID} & \cf{PID} & \\ \hline
|
|
|
|
\end{tabular}
|
|
\end{table}
|
|
%%
|
|
The {\dc} PID is created, see table~\ref{tbl:pidfunction}, with the following failure modes:
|
|
%
|
|
$$ fm(PID) = \{ KnownControlValueErrorV, IncorrectControlErrorV \} .$$
|
|
%
|
|
To add some perspective here, if the failure mode is detectable i.e. $KnownControlValueErrorV$
|
|
it is the responsibility of the calling function to act on this and attempt a safety measure (in this case to turn off the heater and indicate an error condition).
|
|
Where the failure mode is not detectable the control will
|
|
in all likelihood apply an incorrect output.
|
|
Part of FMMD failure analysis is recognising that a design may have
|
|
undetectable errors. Re-design of the modules affected can be identified by their position in the hierarchy
|
|
and thus solutions devised.
|
|
|
|
|
|
|
|
|
|
\begin{figure}[h]
|
|
\centering
|
|
\includegraphics[width=400pt]{../../submission_thesis/CH5_Examples/euler_afferent_PID.png}
|
|
% euler_afferent_PID.png: 1002x342 pixel, 72dpi, 35.35x12.06 cm, bb=0 0 1002 342
|
|
\caption{Euler diagram representing the hierarchy of FMMD analysis applied to the afferent branch of call tree for the PID temperature controller example.}
|
|
\label{fig:euler_afferent_PID}
|
|
\end{figure}
|
|
%
|
|
%
|
|
%
|
|
The software call tree for the afferent flow has now been modelled using FMMD;
|
|
this is represented as an Euler diagram in figure~\ref{fig:euler_afferent_PID}.
|
|
Two call tree branches remain. The LED indication branch and the
|
|
PWM/heater output.
|
|
%
|
|
\subsubsection{Efferent flow, PID demand value to PWM output}
|
|
%
|
|
The efferent data flow, or outputs, are now analysed.
|
|
%
|
|
The monitor function calls the \cf{output\_control} function with the PID demand.
|
|
%
|
|
The \cf{output\_control} function then sets the PWM hardware register, which causes the mark space output of the PWM module to
|
|
apply the demanded power.
|
|
%
|
|
A {\fg} with the Heating element, a PWM module and the \cf{output\_control} function is formed to model this branch
|
|
of the efferent flow.
|
|
%
|
|
This {\fg} is a hardware/software hybrid.
|
|
%
|
|
FMMD analysis is applied to this {\fg} in table~\ref{tbl:heateroutput}.
|
|
%
|
|
For the \cf{output\_control} function, there is a pre-condition that the PWM module is
|
|
configured and working, and has the correct clock frequency.
|
|
%
|
|
A second pre-condition is that the heating element is connected and working.
|
|
%
|
|
The post-condition is that it sets the correct value into the PWM register
|
|
to implement the power output demand.
|
|
%
|
|
%
|
|
%
|
|
A {\dc} is created called HeaterOutput, see table~\ref{tbl:heateroutput},
|
|
with the following failure modes:
|
|
$$fm(HeaterOutput) = \{ HeaterOnFull, HeaterOff, HeaterOutputIncorrect \} .$$
|
|
%
|
|
As an aside: the $HeaterOnFull$ failure should raise alarm bells for designers and
|
|
upon its discovery, measures may be recommended to inhibit this (such as perhaps
|
|
adding a safety relay to cut the power to the heater if the $HeaterOnFull$ condition persists beyond a given time limit).
|
|
%
|
|
%
|
|
\begin{figure}[h]
|
|
\centering
|
|
\includegraphics[width=300pt]{../../submission_thesis/CH5_Examples/euler_heater_output.png}
|
|
% euler_heater_output.png: 392x141 pixel, 72dpi, 13.83x4.97 cm, bb=0 0 392 141
|
|
\caption{Euler diagram showing HeaterOutput with its two hardware components, PWM and HEATER, and its software component \cf{output\_control}.}
|
|
\label{fig:eulerheateroutput}
|
|
\end{figure}
|
|
%
|
|
%
|
|
%
|
|
%
|
|
\subsubsection{Efferent flow: status LEDs}
|
|
%
|
|
The status LEDs will be controlled by general purpose I/O (GPIO) pins.
|
|
%
|
|
Three LEDs could be used, one flashing with a human readable mark
|
|
space ratio representing the heater output, one flashing at a regular interval to
|
|
indicate the processor is alive and another flashing at an interval related to the temperature,
|
|
(to indicate if the temperature readings are within expected ranges).
|
|
%
|
|
Each LED should flash in normal operation, and any LED being permanently on or off
|
|
would indicate to the operator that an error had occurred.
|
|
%
|
|
The pre-condition for this function is that the GPIO
|
|
is connected to working LEDs.
|
|
%
|
|
The post-condition is that the function \cf{setLEDs} will supply correct indication by flashing the LEDs.
|
|
%
|
|
A {\fg} is formed from the GPIO, the LEDs and the software function \cf{setLEDs}.
|
|
%
|
|
FMMD analysis is applied to this {\fg} in table~\ref{tbl:ledoutput}.
|
|
%
|
|
%
|
|
%
|
|
%
|
|
%
|
|
\begin{figure}[h]
|
|
\centering
|
|
\includegraphics[width=300pt]{../../submission_thesis/CH5_Examples/euler_led_output.png}
|
|
% euler_heater_output.png: 392x141 pixel, 72dpi, 13.83x4.97 cm, bb=0 0 392 141
|
|
\caption{Euler diagram showing LEDOutput with its three LEDs and GPIO hardware elements,
|
|
and its software component \cf{setLEDs}.}
|
|
\label{fig:eulerledoutput}
|
|
\end{figure}
|
|
%
|
|
%
|
|
The {\dc} for the \cf{setLEDs} function, GPIO and LEDs has the following failure modes:
|
|
$$ fm(LEDoutput) = \{FailureIndicated, IndicationError \} . $$
|
|
%
|
|
%
|
|
\subsubsection{Final Analysis Stage: PID Temperature Controller}
|
|
%
|
|
The possibility of each software function failing its post-condition without a direct
|
|
underlying cause from one of its components has been included in each analysis stage
|
|
involving software.
|
|
%
|
|
This is because software introduces the possibility of
|
|
anything going wrong!
|
|
%
|
|
The common causes for software failing are:
|
|
\begin{itemize}
|
|
\item Value/RAM corruption, typically from interrupt contention problems~\cite{concurrency_c_tool} or accidental overwriting~\cite{swseatbelt},
|
|
but can be from external sources such as radiation changing bits/values at runtime~\cite{5963919, 5488118};
|
|
\item Address bus errors leading to program errors (program sequence);
|
|
\item ROM memory failures;
|
|
\item Unintended behaviour of software;
|
|
\item Electromagnetic Compatibility (EMC) interference.
|
|
\end{itemize}
|
|
Because the software is running on a medium, that of the processor or micro-controller,
|
|
the FMMD analysis at the final or highest level (see table~\ref{tbl:pid}) must include all possible failure modes of this medium, i.e.
|
|
|
|
\begin{eqnarray*}
|
|
fm (\textit{micro-controller}) = \{ PROM\_FAULT, \\ RAM\_FAULT, \\ CPU\_FAULT, \\ ALU\_FAULT, \\ CLOCK\_STOPPED \}.
|
|
\end{eqnarray*}
|
|
The final FMMD stage forms a {\fg} with the {\dcs}
|
|
determined previously:
|
|
%
|
|
\begin{itemize}
|
|
\item the micro-controller,
|
|
\item PID,
|
|
\item HeaterOutput,
|
|
\item LEDoutput,
|
|
\item the function \cf{monitor}.
|
|
\end{itemize}
|
|
%
|
|
The post-condition for the monitor function is that it implements the PID control task correctly.
|
|
%\fmmdglossCONTRACTPROG
|
|
A {\dc} for the standalone temperature controller is now created, and given the name TempController.
|
|
It will have the following failure modes:
|
|
%
|
|
\begin{eqnarray*}
|
|
fm ( TempController ) = \{ ControlFailureIndicated, \\ ControlFailure, \\ KnownIndicationError, \\ UnknownIndicationError \}.
|
|
\end{eqnarray*}
|
|
|
|
%
|
|
%
|
|
The failure mode analysis of the complete PID controller is represented
|
|
as an Euler diagram in figure~\ref{fig:euler_temp_controller}.
|
|
%
|
|
%
|
|
\begin{figure}[h]
|
|
\centering
|
|
\includegraphics[width=400pt]{../../submission_thesis/CH5_Examples/euler_temp_controller.png}
|
|
% euler_temp_controller.png: 714x251 pixel, 72dpi, 25.19x8.85 cm, bb=0 0 714 251
|
|
\caption{Euler diagram of the temperature controller final analysis stage, showing the hybrid software/hardware {\dcs} and the function at the head of the call tree \cf{monitor}.}
|
|
\label{fig:euler_temp_controller}
|
|
\end{figure}
|
|
%
|
|
\subsection{Conclusion: Standalone system, PID Temperature Controller}
|
|
%
|
|
The PID temperature control example above shows that complete hybrid software/electronic systems can be
|
|
modelled using FMMD.
|
|
%
|
|
The FMMD model can be traversed from undesirable top level failures to the {\bc} {\fms} that are the causes.
|
|
%\fmmdglossOBS
|
|
%%
|
|
This analysis has revealed system level failure modes that are un-handled and some that are undetectable.
|
|
%
|
|
While this may appear poor, with FMMD the undetectable and unhandled failures are actually known: they
|
|
are present in the model because they came from the component {\fms}.
|
|
%
|
|
This means that by using FMMD, the sub-systems which require
|
|
re-design to eliminate or reduce the likelihood of undetectable failure modes can be identified.
|
|
%
|
|
Each system {\fm} of concern can be traced back to the components that caused it.
|
|
%
|
|
The components can be strengthened or additional self diagnostics can be applied to
|
|
alleviate the problems.
|
|
%
|
|
The demands of EN61508~\cite{en61508} for minimum safe failure fraction thresholds~\cite[p.52]{scsh} associated with
|
|
SIL levels make this a desirable feature of any FMEA based methodology.
|
|
%
|
|
This is because the system {\fms} can be traced back to component {\fms} which
|
|
should have published reliability statistics~\cite{fmd91}.
|
|
%
|
|
With the reliability statistics, the SIL dangerous failure probabilities can be listed and summed
|
|
providing data to classify the SIL level.
|
|
%
|
|
For the failure modes caused
|
|
by electronics, reliability statistics can be applied, and the possibilities of using higher rated
|
|
components instead of potentially expensive re-design can be simulated/modelled.
|
|
%
|
|
For software errors, it may be necessary to provide extra functions to provide self checking.
|
|
%
|
|
EN61508 high reliability software measures such as
|
|
duplication of functions with checking functions arbitrating them (diverse programming~\cite[C.3.5]{en61508}) could be applied.
|
|
%
|
|
For instance, measures may be included to validate the processor clocking with an external watchdog and a simple
|
|
communications protocol. For PROM and RAM faults measures such as run-time checksums
|
|
and RAM complement checking can be applied.
|
|
%
|
|
%Using FMMD in conjunction with extra safety measures it can be ensured that no single hardware failure could lead to a
|
|
%system failure, something difficult to prove with current FMEA techniques.
|
|
|
|
|
|
%\subsection{Hardware: Sensors, actuators and indication}
|
|
|
|
|
|
%\subsection{Simple Software Example}
|
|
|
|
|
|
%\subsection{Software FMEA - The software/hardware interface}
|
|
|
|
|
|
|
|
\section{Conclusion}
|
|
|
|
%% NEED TO LIST THE WISH LIST HERE AND ANSWER ALL POINTS
|
|
|
|
Efficiency --- the $O(N^2)$ analysis has been broken down into
|
|
several much easier to deal with $O(N^2)$ analysis stages.
|
|
|
|
While there are no FMEA metrics against which to compare a {\swhw} hybrid,
|
|
using FMMD an estimate of the work to perform, the reasoning distance, can be calculated.
|
|
|
|
|
|
The {\swhw} interface is handled naturally. Any {\hw} failures
|
|
can now no longer be missed or forgotten in the analysis process.
|
|
The {\sw} faces no surprise {\hw} errors that it has no sensible
|
|
way of dealing with.
|
|
|
|
Errors introduced by the {\uP} are unresolved in this example but are listed.
|
|
Some errors, such as RAM faults, are in practical terms impossible to test with 100\% coverage.
|
|
For other errors, self-checking methods linked to external watchdog processors
|
|
can validate correct working of a micro-controller
|
|
and provide a higher degree of statistical integrity.
|
|
|
|
|
|
|
|
Re-usability --- the electronics --- the Pt100 --- s/w functions to read ADC values
|
|
|
|
{
|
|
\footnotesize
|
|
\bibliographystyle{plain}
|
|
\bibliography{../../vmgbibliography,../../mybib}
|
|
}
|
|
\today
|
|
%\today
|
|
\end{document}
|
|
|